diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 0d14709fe9f9fc4e997d6ae2d53c7cca967de618..85918f5234193fbe288b5dddd744d48343d535b4 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -105,6 +105,11 @@ public: UWTable, ///< Function must be in a unwind table ZExt, ///< Zero extended before/after call + // VISC Attributes + In, ///< pointer to read only memory + Out, ///< pointer to write only memory + InOut, ///< pointer to read/write memory + EndAttrKinds ///< Sentinal value useful for loops }; private: diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index 82d9975ca68d8897aad66d5ee1d6d1f453d6bc40..cce8b15fe2c678bce50b4034a3199546d5af8780 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -596,6 +596,11 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(uwtable); KEYWORD(zeroext); + // VISC Parameter Attributes + KEYWORD(in); + KEYWORD(out); + KEYWORD(inout); + KEYWORD(type); KEYWORD(opaque); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index b22d251f9e9ae3b3f7c967427d9ec8a86db74802..bec6f71083c077c8036d08a38d9ea930036546ad 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -947,6 +947,10 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B, case lltok::kw_nocapture: case lltok::kw_returned: case lltok::kw_sret: + // VISC parameter only attributes + case lltok::kw_in: + case lltok::kw_out: + case lltok::kw_inout: HaveError |= Error(Lex.getLoc(), "invalid use of parameter-only attribute on a function"); @@ -1162,6 +1166,10 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) { case lltok::kw_signext: B.addAttribute(Attribute::SExt); break; case lltok::kw_sret: B.addAttribute(Attribute::StructRet); break; case lltok::kw_zeroext: B.addAttribute(Attribute::ZExt); break; + // VISC parameter attributes + case lltok::kw_in: B.addAttribute(Attribute::In); break; + case lltok::kw_out: B.addAttribute(Attribute::Out); break; + case lltok::kw_inout: B.addAttribute(Attribute::InOut); break; case lltok::kw_alignstack: case lltok::kw_alwaysinline: @@ -1218,6 +1226,10 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) { case lltok::kw_nocapture: case lltok::kw_returned: case lltok::kw_sret: + // VISC Parameter only attributes + case lltok::kw_in: + case lltok::kw_out: + case lltok::kw_inout: HaveError |= Error(Lex.getLoc(), "invalid use of parameter-only attribute"); break; diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h index e889a2bfd0e7e8a5a763fa6da8980e78ea73c149..817b74def0c129a267f8ce0024fb74c00739c82b 100644 --- a/llvm/lib/AsmParser/LLToken.h +++ b/llvm/lib/AsmParser/LLToken.h @@ -127,6 +127,9 @@ namespace lltok { kw_uwtable, kw_zeroext, + // VISC parameter attributes + kw_in, kw_out, kw_inout, + kw_type, kw_opaque, diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index e48ebb133527f46e4a095b98b1b41e06127a5c7b..c6142f8ea13472046386326cc0cc1c03647e5d11 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -263,6 +263,14 @@ std::string Attribute::getAsString(bool InAttrGrp) const { return Result; } + // VISC attributes for arguments + if (hasAttribute(Attribute::In)) + return "in"; + if (hasAttribute(Attribute::Out)) + return "out"; + if (hasAttribute(Attribute::InOut)) + return "inout"; + llvm_unreachable("Unknown attribute"); } @@ -399,6 +407,11 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) { case Attribute::NoBuiltin: return 1ULL << 38; case Attribute::Returned: return 1ULL << 39; case Attribute::Cold: return 1ULL << 40; + + // VISC Attributes + case Attribute::In: return 1ULL << 41; + case Attribute::Out: return 1ULL << 42; + case Attribute::InOut: return 1ULL << 43; } llvm_unreachable("Unsupported attribute type"); } diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp index 2961a966afb63e48fa80e241e853d31264abe6b2..fafc892ea7b49139e0846934dc257e035a1ca950 100644 --- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp +++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp @@ -26,6 +26,7 @@ #include "llvm/Linker.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/FileSystem.h" +#include "llvm/IR/Attributes.h" #include <sstream> @@ -37,6 +38,7 @@ using namespace builddfg; namespace { // Helper function declarations + static bool hasAttribute(Function*, unsigned, Attribute::AttrKind); static std::string getPTXFilename(const Module&); static std::string getFilenameFromModule(const Module& M); static void changeDataLayout(Module &); @@ -66,6 +68,18 @@ namespace { }; + // Helper class to maintain the tuple of host pointer, device pointer and size + // in bytes. Would have preferred to use tuple but support not yet available + class OutputPtr { + public: + OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes) + : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {} + + Value* h_ptr; + Value* d_ptr; + Value* bytes; + }; + // Visitor for Code generation traversal (tree traversal for now) class Kernel { public: @@ -107,9 +121,8 @@ namespace { Function* llvm_visc_ptx_launch; Function* llvm_visc_ptx_wait; Function* llvm_visc_ptx_initContext; - Function* llvm_visc_ptx_input_scalar; - Function* llvm_visc_ptx_input_ptr; - Function* llvm_visc_ptx_output_ptr; + Function* llvm_visc_ptx_argument_scalar; + Function* llvm_visc_ptx_argument_ptr; Function* llvm_visc_ptx_getOutput; Function* llvm_visc_ptx_executeNode; @@ -227,17 +240,13 @@ namespace { runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType())); DEBUG(errs() << *llvm_visc_ptx_initContext); - llvm_visc_ptx_input_scalar = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_input_scalar", - runtimeModule->getFunction("llvm_visc_ptx_input_scalar")->getFunctionType())); - DEBUG(errs() << *llvm_visc_ptx_input_scalar); + llvm_visc_ptx_argument_scalar = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_scalar", + runtimeModule->getFunction("llvm_visc_ptx_argument_scalar")->getFunctionType())); + DEBUG(errs() << *llvm_visc_ptx_argument_scalar); - llvm_visc_ptx_input_ptr = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_input_ptr", - runtimeModule->getFunction("llvm_visc_ptx_input_ptr")->getFunctionType())); - DEBUG(errs() << *llvm_visc_ptx_input_ptr); - - llvm_visc_ptx_output_ptr = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_output_ptr", - runtimeModule->getFunction("llvm_visc_ptx_output_ptr")->getFunctionType())); - DEBUG(errs() << *llvm_visc_ptx_output_ptr); + llvm_visc_ptx_argument_ptr = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_ptr", + runtimeModule->getFunction("llvm_visc_ptx_argument_ptr")->getFunctionType())); + DEBUG(errs() << *llvm_visc_ptx_argument_ptr); llvm_visc_ptx_getOutput = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_getOutput", runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType())); @@ -302,7 +311,7 @@ namespace { // argument from argument list of this internal node Value* inputVal; if(SrcDF->isEntryNode()) { - inputVal = getArgumentAt(ParentF_X86, i); + inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition()); DEBUG(errs() << "Argument "<< i<< " = " << *inputVal << "\n"); } else { @@ -348,6 +357,10 @@ namespace { // function before. assert(N->getGenFunc() == NULL && "Code already generated for this node"); + // Useful values + Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1); + Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0); + // If kernel struct has not been initialized with kernel function, then fail assert(kernel != NULL && "No kernel found!!"); @@ -437,6 +450,8 @@ namespace { // Iterate over the required input edges of the node and use the visc-rt API // to set inputs DEBUG(errs() << "Iterate over input edges of node and insert visc api\n"); + + std::vector<OutputPtr> OutputPointers; for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) { Value* inputVal = getInValueAt(C, i, F_X86, RI); @@ -446,6 +461,19 @@ namespace { // type on target machine, but for pointers, the size of data would be the // next integer argument if(inputVal->getType()->isPointerTy()) { + // CheckAttribute + Value* isOutput = (hasAttribute(CF, i, Attribute::Out))? True : False; + Value* isInput = ((hasAttribute(CF, i, Attribute::Out)) + && !(hasAttribute(CF, i, Attribute::In)))? False : True; + + Argument* A = getArgumentAt(CF, i); + if(isOutput == True) { + errs() << *A << " is an OUTPUT argument\n"; + } + if(isInput == True) { + errs() << *A << " is an INPUT argument\n"; + } + Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal, Type::getInt8PtrTy(M.getContext()), inputVal->getName()+".i8ptr", @@ -457,10 +485,15 @@ namespace { Value* setInputArgs[] = {GraphID, inputValI8Ptr, ConstantInt::get(Type::getInt32Ty(M.getContext()),i), - inputSize + inputSize, + isInput, + isOutput }; - CallInst::Create(llvm_visc_ptx_input_ptr, - ArrayRef<Value*>(setInputArgs, 4), "", RI); + Value* d_ptr = CallInst::Create(llvm_visc_ptx_argument_ptr, + ArrayRef<Value*>(setInputArgs, 6), "", RI); + // If this has out attribute, store the returned device pointer in + // memory to read device memory later + if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize)); } else { // Scalar Input // Store the scalar value on stack and then pass the pointer to its @@ -478,7 +511,7 @@ namespace { ConstantInt::get(Type::getInt32Ty(M.getContext()),i), ConstantExpr::getSizeOf(inputVal->getType()) }; - CallInst::Create(llvm_visc_ptx_input_scalar, + CallInst::Create(llvm_visc_ptx_argument_scalar, ArrayRef<Value*>(setInputArgs, 4), "", RI); } @@ -499,11 +532,14 @@ namespace { unsigned outputIndex = CF->getFunctionType()->getNumParams(); Value* outputSize = ConstantExpr::getSizeOf(OutputTy); Value* setOutputArgs[] = {GraphID, + Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex), - ConstantExpr::getSizeOf(OutputTy)}; + ConstantExpr::getSizeOf(OutputTy), + False, + True}; - CallInst* d_Output = CallInst::Create(llvm_visc_ptx_output_ptr, - ArrayRef<Value*>(setOutputArgs,3), + CallInst* d_Output = CallInst::Create(llvm_visc_ptx_argument_ptr, + ArrayRef<Value*>(setOutputArgs, 6), "d_output."+CF->getName(), RI); @@ -558,17 +594,30 @@ namespace { RI); // Read Output Struct Value* GetOutputArgs[] = {GraphID, + Constant::getNullValue(Type::getInt8PtrTy(M.getContext())), d_Output, outputSize}; CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput, - ArrayRef<Value*>(GetOutputArgs, 3), + ArrayRef<Value*>(GetOutputArgs, 4), "h_output."+CF->getName()+".addr", RI); // Read each device pointer listed in output struct // Load the output struct CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType()->getPointerTo(), "output.ptr", RI); Value* KernelOutput = new LoadInst(BI, "", RI); - for(unsigned i=0; i < OutputTy->getNumElements(); i++) { + + // Read all the pointer arguments which had side effects i.e., had out + // attribute + for(auto output: OutputPointers) { + errs() << "Read: " << *output.d_ptr << "\n"; + errs() << "\t To: " << *output.h_ptr << "\n"; + errs() << "\t #bytes: " << *output.bytes << "\n"; + Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes}; + CallInst* CI = CallInst::Create(llvm_visc_ptx_getOutput, + ArrayRef<Value*>(GetOutputArgs, 4), + "", RI); + } + /*for(unsigned i=0; i < OutputTy->getNumElements(); i++) { Type* elemTy = OutputTy->getElementType(i); if(elemTy->isPointerTy()) { // Pointer type @@ -594,7 +643,8 @@ namespace { KernelOutput = InsertValueInst::Create(KernelOutput, h_ptr, ArrayRef<unsigned>(i), "", RI); } - } + }*/ + // Prepare output KernelOutput->setName("output."+CF->getName()); OutputMap[C] = KernelOutput; @@ -1239,6 +1289,10 @@ namespace { * Helper functions * ******************************************************************************/ + // Find if argument has the given attribute + static bool hasAttribute(Function* F, unsigned arg_index, Attribute::AttrKind AK) { + return F->getAttributes().hasAttribute(arg_index+1, AK); + } // Get generated PTX binary name static std::string getPTXFilename(const Module& M) { std::string moduleID = M.getModuleIdentifier(); diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp index 3cd64258af529c11937c3662ae3e7c8616cac9c1..80bc859e8049b557a2f2f1553b5fda5bc9b1de9e 100644 --- a/llvm/projects/visc-rt/visc-rt.cpp +++ b/llvm/projects/visc-rt/visc-rt.cpp @@ -3,6 +3,7 @@ #include <cstdio> #include <string> #include <CL/cl.h> +//#include "visc-rt.h" typedef struct { pthread_t threadID; @@ -35,8 +36,10 @@ void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) { extern "C" void llvm_visc_x86_wait(void* graphID) { + printf("Waiting for pthread to finish ...\n"); DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID; pthread_join(Context->threadID, NULL); + printf("\t... pthread Done!\n"); } extern "C" @@ -82,49 +85,42 @@ void llvm_visc_ptx_clearContext() { } extern "C" -void llvm_visc_ptx_input_scalar(void* graphID, void* input, int arg_index, size_t size) { - printf("Set Scalar Input. Argument Index = %d\n", arg_index); +void llvm_visc_ptx_argument_scalar(void* graphID, void* input, int arg_index, size_t size) { + printf("Set Scalar Input. Argument Index = %d, Size = %lu\n", arg_index, size); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input); checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument"); } -cl_mem C; extern "C" -void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t size) { - printf("Set Pointer Input. Argument Index = %d\n", arg_index); +void* llvm_visc_ptx_argument_ptr(void* graphID, void* input, int arg_index, size_t size, bool isInput, bool isOutput) { + printf("Set Pointer Input. Argument Index = %d, Ptr = %p, Size = %lu\n", arg_index, input, size); + printf("\tInput: %d, Output: %d\n", isInput, isOutput); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_int errcode; - cl_mem d_input = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE | - CL_MEM_COPY_HOST_PTR, size, input, &errcode); + cl_mem_flags clFlags; + if(isInput && isOutput) clFlags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + else if(isInput) clFlags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + else if(isOutput) clFlags = CL_MEM_READ_WRITE; + else clFlags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR; + + if(input == NULL) { + printf("Found NULL input value!!"); + input = NULL; + } + cl_mem d_input = clCreateBuffer(Context->clGPUContext, clFlags, size, input, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input); - checkErr(errcode, CL_SUCCESS, "Failure to set pointer input argument"); - printf("Set Input: DevicePtr = 0x%x, Size = %d\n", d_input, size); - C = d_input; + checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument"); + printf("\tDevicePtr = %p, Size = %lu\n", d_input, size); return d_input; } extern "C" -void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) { - printf("Set Pointer Output. Argument Index = %d\n", arg_index); - DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; - cl_int errcode; - cl_mem d_output = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE, - size, NULL, &errcode); - checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device"); - errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output); - checkErr(errcode, CL_SUCCESS, "Failure to set pointer output argument"); - printf("Set Output: DevicePtr = 0x%x, Size = %d\n", d_output, size); - return d_output; -} -int count = 0; -extern "C" -void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) { - if(size > 1000) - d_output = (void*) C; - printf("Get Output: DevicePtr = 0x%x, Size = %d\n", d_output, size); - void* h_output = malloc(size); +void* llvm_visc_ptx_getOutput(void* graphID, void* h_output, void* d_output, size_t size) { + printf("Get Output: HostPtr = %p, DevicePtr = %p, Size = %lu\n", h_output, d_output, size); + if(h_output == NULL) + h_output = malloc(size); DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size, h_output, 0, NULL, NULL); @@ -136,21 +132,39 @@ extern "C" void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t* localWorkSize, const size_t* globalWorkSize) { - printf("Execute Node\n"); - DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; - cl_event* event; - // -------------- Just for testing ------ - if(globalWorkSize == NULL) { - // TODO: Remove this hack - size_t WorkSize[2] = {1024, 1024}; - cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue, - Context->clKernel, workDim, NULL, WorkSize, localWorkSize, 0, NULL, event); - checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); - return event; + size_t GlobalWG[3]; + size_t LocalWG[3]; + printf("Execute Node: Number of Dimensions = %u\n", workDim); + + // OpenCL EnqeueNDRangeKernel function results in segementation fault if we + // directly use local and global work groups arguments. Hence, allocating it + // on stack and copying. + printf("Global Work Group: "); + for(unsigned i=0; i<workDim; i++) { + printf("%lu ", globalWorkSize[i]); + GlobalWG[i] = globalWorkSize[i]; + } + printf("\n"); + + // OpenCL allows local workgroup to be null. + if(localWorkSize != NULL) { + printf("Local Work Group: "); + for(unsigned i=0; i<workDim; i++) { + printf("%lu ", localWorkSize[i]); + LocalWG[i] = localWorkSize[i]; + } + printf("\n"); } + DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID; + // TODO: Would like to use event to ensure better scheduling of kernels. + // Currently passing the event paratemeter results in seg fault with + // clEnqueueNDRangeKernel. + cl_event* event; + printf("Enqueuing kernel: %p, %p, %d, %lu, %lu\n", Context->clCommandQue, Context->clKernel, workDim, GlobalWG[0], GlobalWG[1]); cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue, - Context->clKernel, workDim, NULL, globalWorkSize, localWorkSize, 0, NULL, event); + Context->clKernel, workDim, NULL, GlobalWG, NULL, 0, NULL, NULL); + printf("Enqueued kernel\n"); checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); return event; } @@ -262,6 +276,8 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { // get the list of GPU devices associated with context errcode = clGetContextInfo(Context->clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); + checkErr(errcode, CL_SUCCESS, "Failure to get context info length"); + cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode |= clGetContextInfo(Context->clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); @@ -281,9 +297,11 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { &binaryStatus, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to create program from binary"); + printf("Building kernel - %s from file %s\n", KernelName, FileName); errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL); // If build fails, get build log from device if(errcode != CL_SUCCESS) { + printf("Failure to build program\n"); size_t len = 0; errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0, NULL, &len); @@ -295,10 +313,8 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) { checkErr(errcode, CL_SUCCESS, "Failure to collect program build log"); printf("Device Build Log:\n%s\n", log); - - + exit(EXIT_FAILURE); } - checkErr(errcode, CL_SUCCESS, "Failure to build program"); Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode); checkErr(errcode, CL_SUCCESS, "Failure to create kernel"); diff --git a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll index d1c081571b57bc4787eae321d52fa45ee5ab7a88..c76076a8726fcd1ff9fa9d355b1a0153e9c64e16 100644 --- a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll +++ b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll @@ -12,6 +12,7 @@ target triple = "x86_64-unknown-linux-gnu" @custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1 @hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1 +@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1 @.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1 @.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1 @.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1 @@ -135,7 +136,7 @@ declare i32 @printf(i8* nocapture, ...) #1 ; --------------- VISC Intrinsics --------------- ; Return Type of VISC Compute Matrix Mul -%rtype = type {float*, i64} +%rtype = type {i64} %struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }> ; Function Attrs: nounwind @@ -182,7 +183,7 @@ declare void @llvm.visc.bind.output(i8*, i32, i32) ; ----------------- VISC intrinsics end ------------------ ; Function Attrs: nounwind uwtable -define %rtype @matrixMul(float* nocapture %A, i64 %bytes_A, float* nocapture %B, i64 %bytes_B, float* %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { +define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 { entry: ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0)) @@ -235,8 +236,7 @@ for.end: ; preds = %for.body, %entry store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0 ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0)) ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0)) - %.fca.0.insert = insertvalue %rtype undef, float* %C, 0 - %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i64 %bytes_C, 1 + %.fca.1.insert = insertvalue %rtype undef, i64 %bytes_C, 0 ret %rtype %.fca.1.insert } @@ -254,8 +254,7 @@ define %rtype @MatrixMulRoot(float* %h_A, i64 %bytes_A, float* %h_B, i64 %bytes_ call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m ; Bind Outputs - call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0); d_C - call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 1); bytes_C + call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0); bytes_C ret %rtype zeroinitializer } @@ -373,7 +372,7 @@ randomInit.exit41: ; preds = %for.body.i40 ; -------------------------------- Completed VISC Launch Call -------------------------------- %3 = extractvalue %rtype %out, 0 - %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %3) + %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2) %tobool = icmp eq i32 %call14, 0 br i1 %tobool, label %if.else, label %if.then