diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h
index 0d14709fe9f9fc4e997d6ae2d53c7cca967de618..85918f5234193fbe288b5dddd744d48343d535b4 100644
--- a/llvm/include/llvm/IR/Attributes.h
+++ b/llvm/include/llvm/IR/Attributes.h
@@ -105,6 +105,11 @@ public:
     UWTable,               ///< Function must be in a unwind table
     ZExt,                  ///< Zero extended before/after call
 
+    // VISC Attributes
+    In,                    ///< pointer to read only memory
+    Out,                   ///< pointer to write only memory
+    InOut,                 ///< pointer to read/write memory
+
     EndAttrKinds           ///< Sentinal value useful for loops
   };
 private:
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index 82d9975ca68d8897aad66d5ee1d6d1f453d6bc40..cce8b15fe2c678bce50b4034a3199546d5af8780 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -596,6 +596,11 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(uwtable);
   KEYWORD(zeroext);
 
+  // VISC Parameter Attributes
+  KEYWORD(in);
+  KEYWORD(out);
+  KEYWORD(inout);
+
   KEYWORD(type);
   KEYWORD(opaque);
 
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index b22d251f9e9ae3b3f7c967427d9ec8a86db74802..bec6f71083c077c8036d08a38d9ea930036546ad 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -947,6 +947,10 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
     case lltok::kw_nocapture:
     case lltok::kw_returned:
     case lltok::kw_sret:
+    // VISC parameter only attributes
+    case lltok::kw_in:
+    case lltok::kw_out:
+    case lltok::kw_inout:
       HaveError |=
         Error(Lex.getLoc(),
               "invalid use of parameter-only attribute on a function");
@@ -1162,6 +1166,10 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
     case lltok::kw_signext:         B.addAttribute(Attribute::SExt); break;
     case lltok::kw_sret:            B.addAttribute(Attribute::StructRet); break;
     case lltok::kw_zeroext:         B.addAttribute(Attribute::ZExt); break;
+    // VISC parameter attributes
+    case lltok::kw_in:         B.addAttribute(Attribute::In); break;
+    case lltok::kw_out:        B.addAttribute(Attribute::Out); break;
+    case lltok::kw_inout:      B.addAttribute(Attribute::InOut); break;
 
     case lltok::kw_alignstack:
     case lltok::kw_alwaysinline:
@@ -1218,6 +1226,10 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
     case lltok::kw_nocapture:
     case lltok::kw_returned:
     case lltok::kw_sret:
+    // VISC Parameter only attributes
+    case lltok::kw_in:
+    case lltok::kw_out:
+    case lltok::kw_inout:
       HaveError |= Error(Lex.getLoc(), "invalid use of parameter-only attribute");
       break;
 
diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h
index e889a2bfd0e7e8a5a763fa6da8980e78ea73c149..817b74def0c129a267f8ce0024fb74c00739c82b 100644
--- a/llvm/lib/AsmParser/LLToken.h
+++ b/llvm/lib/AsmParser/LLToken.h
@@ -127,6 +127,9 @@ namespace lltok {
     kw_uwtable,
     kw_zeroext,
 
+    // VISC parameter attributes
+    kw_in, kw_out, kw_inout,
+
     kw_type,
     kw_opaque,
 
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index e48ebb133527f46e4a095b98b1b41e06127a5c7b..c6142f8ea13472046386326cc0cc1c03647e5d11 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -263,6 +263,14 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
     return Result;
   }
 
+  // VISC attributes for arguments
+  if (hasAttribute(Attribute::In))
+    return "in";
+  if (hasAttribute(Attribute::Out))
+    return "out";
+  if (hasAttribute(Attribute::InOut))
+    return "inout";
+
   llvm_unreachable("Unknown attribute");
 }
 
@@ -399,6 +407,11 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) {
   case Attribute::NoBuiltin:       return 1ULL << 38;
   case Attribute::Returned:        return 1ULL << 39;
   case Attribute::Cold:            return 1ULL << 40;
+
+  // VISC Attributes
+  case Attribute::In:                return 1ULL << 41;
+  case Attribute::Out:               return 1ULL << 42;
+  case Attribute::InOut:             return 1ULL << 43;
   }
   llvm_unreachable("Unsupported attribute type");
 }
diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 2961a966afb63e48fa80e241e853d31264abe6b2..fafc892ea7b49139e0846934dc257e035a1ca950 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -26,6 +26,7 @@
 #include "llvm/Linker.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/IR/Attributes.h"
 
 #include <sstream>
 
@@ -37,6 +38,7 @@ using namespace builddfg;
 namespace {
 
 // Helper function declarations
+  static bool hasAttribute(Function*, unsigned, Attribute::AttrKind);
   static std::string getPTXFilename(const Module&);
   static std::string getFilenameFromModule(const Module& M);
   static void changeDataLayout(Module &);
@@ -66,6 +68,18 @@ namespace {
 
   };
 
+  // Helper class to maintain the tuple of host pointer, device pointer and size
+  // in bytes. Would have preferred to use tuple but support not yet available
+  class OutputPtr {
+  public:
+    OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes)
+      : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
+
+    Value* h_ptr;
+    Value* d_ptr;
+    Value* bytes;
+  };
+
   // Visitor for Code generation traversal (tree traversal for now)
   class Kernel {
   public:
@@ -107,9 +121,8 @@ namespace {
     Function* llvm_visc_ptx_launch;
     Function* llvm_visc_ptx_wait;
     Function* llvm_visc_ptx_initContext;
-    Function* llvm_visc_ptx_input_scalar;
-    Function* llvm_visc_ptx_input_ptr;
-    Function* llvm_visc_ptx_output_ptr;
+    Function* llvm_visc_ptx_argument_scalar;
+    Function* llvm_visc_ptx_argument_ptr;
     Function* llvm_visc_ptx_getOutput;
     Function* llvm_visc_ptx_executeNode;
 
@@ -227,17 +240,13 @@ namespace {
         runtimeModule->getFunction("llvm_visc_ptx_initContext")->getFunctionType()));
     DEBUG(errs() << *llvm_visc_ptx_initContext);
 
-    llvm_visc_ptx_input_scalar = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_input_scalar",
-        runtimeModule->getFunction("llvm_visc_ptx_input_scalar")->getFunctionType()));
-    DEBUG(errs() << *llvm_visc_ptx_input_scalar);
+    llvm_visc_ptx_argument_scalar = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_scalar",
+        runtimeModule->getFunction("llvm_visc_ptx_argument_scalar")->getFunctionType()));
+    DEBUG(errs() << *llvm_visc_ptx_argument_scalar);
     
-    llvm_visc_ptx_input_ptr = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_input_ptr",
-        runtimeModule->getFunction("llvm_visc_ptx_input_ptr")->getFunctionType()));
-    DEBUG(errs() << *llvm_visc_ptx_input_ptr);
-
-    llvm_visc_ptx_output_ptr = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_output_ptr",
-        runtimeModule->getFunction("llvm_visc_ptx_output_ptr")->getFunctionType()));
-    DEBUG(errs() << *llvm_visc_ptx_output_ptr);
+    llvm_visc_ptx_argument_ptr = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_argument_ptr",
+        runtimeModule->getFunction("llvm_visc_ptx_argument_ptr")->getFunctionType()));
+    DEBUG(errs() << *llvm_visc_ptx_argument_ptr);
 
     llvm_visc_ptx_getOutput = cast<Function>(M.getOrInsertFunction("llvm_visc_ptx_getOutput",
       runtimeModule->getFunction("llvm_visc_ptx_getOutput")->getFunctionType()));
@@ -302,7 +311,7 @@ namespace {
     // argument from argument list of this internal node
     Value* inputVal;
     if(SrcDF->isEntryNode()) {
-      inputVal = getArgumentAt(ParentF_X86, i);
+      inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
       DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
     }
     else {
@@ -348,6 +357,10 @@ namespace {
     // function before.
     assert(N->getGenFunc() == NULL && "Code already generated for this node");
 
+    // Useful values
+    Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
+    Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
+
     // If kernel struct has not been initialized with kernel function, then fail
     assert(kernel != NULL && "No kernel found!!");
 
@@ -437,6 +450,8 @@ namespace {
     // Iterate over the required input edges of the node and use the visc-rt API
     // to set inputs
     DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
+
+    std::vector<OutputPtr> OutputPointers;
     for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
 
       Value* inputVal = getInValueAt(C, i, F_X86, RI);
@@ -446,6 +461,19 @@ namespace {
       // type on target machine, but for pointers, the size of data would be the
       // next integer argument
       if(inputVal->getType()->isPointerTy()) {
+        // CheckAttribute
+        Value* isOutput = (hasAttribute(CF, i, Attribute::Out))? True : False;
+        Value* isInput = ((hasAttribute(CF, i, Attribute::Out))
+                          && !(hasAttribute(CF, i, Attribute::In)))? False : True;
+
+        Argument* A = getArgumentAt(CF, i);
+        if(isOutput == True) {
+          errs() << *A << " is an OUTPUT argument\n";
+        }
+        if(isInput == True) {
+          errs() << *A << " is an INPUT argument\n";
+        }
+
         Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal,
                                                           Type::getInt8PtrTy(M.getContext()),
                                                           inputVal->getName()+".i8ptr",
@@ -457,10 +485,15 @@ namespace {
         Value* setInputArgs[] = {GraphID,
                                 inputValI8Ptr,
                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                                inputSize
+                                inputSize,
+                                isInput,
+                                isOutput
                                 };
-        CallInst::Create(llvm_visc_ptx_input_ptr,
-              ArrayRef<Value*>(setInputArgs, 4), "", RI);
+        Value* d_ptr = CallInst::Create(llvm_visc_ptx_argument_ptr,
+              ArrayRef<Value*>(setInputArgs, 6), "", RI);
+        // If this has out attribute, store the returned device pointer in
+        // memory to read device memory later
+        if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
       }
       else { // Scalar Input
         // Store the scalar value on stack and then pass the pointer to its
@@ -478,7 +511,7 @@ namespace {
                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
                                 ConstantExpr::getSizeOf(inputVal->getType())
                                 };
-        CallInst::Create(llvm_visc_ptx_input_scalar,
+        CallInst::Create(llvm_visc_ptx_argument_scalar,
               ArrayRef<Value*>(setInputArgs, 4), "", RI);
       }
 
@@ -499,11 +532,14 @@ namespace {
     unsigned outputIndex = CF->getFunctionType()->getNumParams();
     Value* outputSize = ConstantExpr::getSizeOf(OutputTy);
     Value* setOutputArgs[] = {GraphID,
+                              Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
                               ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
-                              ConstantExpr::getSizeOf(OutputTy)};
+                              ConstantExpr::getSizeOf(OutputTy),
+                              False,
+                              True};
 
-    CallInst* d_Output = CallInst::Create(llvm_visc_ptx_output_ptr,
-                                        ArrayRef<Value*>(setOutputArgs,3),
+    CallInst* d_Output = CallInst::Create(llvm_visc_ptx_argument_ptr,
+                                        ArrayRef<Value*>(setOutputArgs, 6),
                                         "d_output."+CF->getName(),
                                         RI);
 
@@ -558,17 +594,30 @@ namespace {
                      RI);
     // Read Output Struct
     Value* GetOutputArgs[] = {GraphID,
+                              Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
                               d_Output,
                               outputSize};
     CallInst* h_Output = CallInst::Create(llvm_visc_ptx_getOutput,
-                                          ArrayRef<Value*>(GetOutputArgs, 3),
+                                          ArrayRef<Value*>(GetOutputArgs, 4),
                                           "h_output."+CF->getName()+".addr",
                                           RI);
     // Read each device pointer listed in output struct
     // Load the output struct
     CastInst* BI = BitCastInst::CreatePointerCast(h_Output, CF->getReturnType()->getPointerTo(), "output.ptr", RI);
     Value* KernelOutput = new LoadInst(BI, "", RI);
-    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
+
+    // Read all the pointer arguments which had side effects i.e., had out
+    // attribute
+    for(auto output: OutputPointers) {
+      errs() << "Read: " << *output.d_ptr << "\n";
+      errs() << "\t To: " << *output.h_ptr << "\n";
+      errs() << "\t #bytes: " << *output.bytes << "\n";
+      Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
+      CallInst* CI = CallInst::Create(llvm_visc_ptx_getOutput,
+                                      ArrayRef<Value*>(GetOutputArgs, 4),
+                                      "", RI);
+    }
+    /*for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
       Type* elemTy = OutputTy->getElementType(i);
       if(elemTy->isPointerTy()) {
         // Pointer type
@@ -594,7 +643,8 @@ namespace {
         KernelOutput = InsertValueInst::Create(KernelOutput, h_ptr, ArrayRef<unsigned>(i), "", RI);
 
       }
-    }
+    }*/
+
     // Prepare output
     KernelOutput->setName("output."+CF->getName());
     OutputMap[C] = KernelOutput;
@@ -1239,6 +1289,10 @@ namespace {
  *                              Helper functions                              *
  ******************************************************************************/
 
+  // Find if argument has the given attribute
+  static bool hasAttribute(Function* F, unsigned arg_index, Attribute::AttrKind AK) {
+    return F->getAttributes().hasAttribute(arg_index+1, AK);
+  }
   // Get generated PTX binary name
   static std::string getPTXFilename(const Module& M) {
     std::string moduleID = M.getModuleIdentifier();
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index 3cd64258af529c11937c3662ae3e7c8616cac9c1..80bc859e8049b557a2f2f1553b5fda5bc9b1de9e 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -3,6 +3,7 @@
 #include <cstdio>
 #include <string>
 #include <CL/cl.h>
+//#include "visc-rt.h"
 
 typedef struct {
   pthread_t threadID; 
@@ -35,8 +36,10 @@ void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) {
 
 extern "C"
 void llvm_visc_x86_wait(void* graphID) {
+  printf("Waiting for pthread to finish ...\n");
   DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
   pthread_join(Context->threadID, NULL);
+  printf("\t... pthread Done!\n");
 }
 
 extern "C"
@@ -82,49 +85,42 @@ void llvm_visc_ptx_clearContext() {
 }
 
 extern "C"
-void llvm_visc_ptx_input_scalar(void* graphID, void* input, int arg_index, size_t size) {
-  printf("Set Scalar Input. Argument Index = %d\n", arg_index);
+void llvm_visc_ptx_argument_scalar(void* graphID, void* input, int arg_index, size_t size) {
+  printf("Set Scalar Input. Argument Index = %d, Size = %lu\n", arg_index, size);
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input);
   checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
 }
 
-cl_mem C;
 extern "C"
-void* llvm_visc_ptx_input_ptr(void* graphID, void* input, int arg_index, size_t size) {
-  printf("Set Pointer Input. Argument Index = %d\n", arg_index);
+void* llvm_visc_ptx_argument_ptr(void* graphID, void* input, int arg_index, size_t size, bool isInput, bool isOutput) {
+  printf("Set Pointer Input. Argument Index = %d, Ptr = %p, Size = %lu\n", arg_index, input, size);
+  printf("\tInput: %d, Output: %d\n", isInput, isOutput);
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_int errcode;
-  cl_mem d_input = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE |
-      CL_MEM_COPY_HOST_PTR, size, input, &errcode);
+  cl_mem_flags clFlags;
+  if(isInput && isOutput) clFlags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+  else if(isInput)        clFlags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+  else if(isOutput)       clFlags = CL_MEM_READ_WRITE;
+  else                    clFlags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
+
+  if(input == NULL) {
+    printf("Found NULL input value!!");
+    input = NULL;
+  }
+  cl_mem d_input = clCreateBuffer(Context->clGPUContext, clFlags, size, input, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
   errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input);
-  checkErr(errcode, CL_SUCCESS, "Failure to set pointer input argument");
-  printf("Set Input: DevicePtr = 0x%x, Size = %d\n", d_input, size);
-  C = d_input;
+  checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
+  printf("\tDevicePtr = %p, Size = %lu\n", d_input, size);
   return d_input;
 }
 
 extern "C"
-void* llvm_visc_ptx_output_ptr(void* graphID, int arg_index, size_t size) {
-  printf("Set Pointer Output. Argument Index = %d\n", arg_index);
-  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
-  cl_int errcode;
-  cl_mem d_output = clCreateBuffer(Context->clGPUContext, CL_MEM_READ_WRITE,
-      size, NULL, &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
-  errcode |= clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output);
-  checkErr(errcode, CL_SUCCESS, "Failure to set pointer output argument");
-  printf("Set Output: DevicePtr = 0x%x, Size = %d\n", d_output, size);
-  return d_output;
-}
-int count = 0;
-extern "C"
-void* llvm_visc_ptx_getOutput(void* graphID, void* d_output, size_t size) {
-  if(size > 1000)
-    d_output = (void*) C;
-  printf("Get Output: DevicePtr = 0x%x, Size = %d\n", d_output, size);
-  void* h_output = malloc(size);
+void* llvm_visc_ptx_getOutput(void* graphID, void* h_output, void* d_output, size_t size) {
+  printf("Get Output: HostPtr = %p, DevicePtr = %p, Size = %lu\n", h_output, d_output, size);
+  if(h_output == NULL)
+    h_output = malloc(size);
   DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
   cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size,
                                 h_output, 0, NULL, NULL);
@@ -136,21 +132,39 @@ extern "C"
 void* llvm_visc_ptx_executeNode(void* graphID, unsigned workDim , const size_t*
                                 localWorkSize, const size_t* globalWorkSize) {
 
-  printf("Execute Node\n");
-  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
-  cl_event* event;
-  // -------------- Just for testing ------
-  if(globalWorkSize == NULL) {
-    // TODO: Remove this hack
-    size_t WorkSize[2] = {1024, 1024};
-    cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
-        Context->clKernel, workDim, NULL, WorkSize, localWorkSize, 0, NULL, event);
-    checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
-    return event;
+  size_t GlobalWG[3];
+  size_t LocalWG[3];
+  printf("Execute Node: Number of Dimensions = %u\n", workDim);
+
+  // OpenCL EnqeueNDRangeKernel function results in segementation fault if we
+  // directly use local and global work groups arguments. Hence, allocating it
+  // on stack and copying.
+  printf("Global Work Group: ");
+  for(unsigned i=0; i<workDim; i++) {
+    printf("%lu ", globalWorkSize[i]);
+    GlobalWG[i] = globalWorkSize[i];
+  }
+  printf("\n");
+
+  // OpenCL allows local workgroup to be null.
+  if(localWorkSize != NULL) {
+    printf("Local Work Group: ");
+    for(unsigned i=0; i<workDim; i++) {
+      printf("%lu ", localWorkSize[i]);
+      LocalWG[i] = localWorkSize[i];
+    }
+    printf("\n");
   }
 
+  DFNodeContext_PTX* Context = (DFNodeContext_PTX*) graphID;
+  // TODO: Would like to use event to ensure better scheduling of kernels.
+  // Currently passing the event paratemeter results in seg fault with
+  // clEnqueueNDRangeKernel.
+  cl_event* event;
+  printf("Enqueuing kernel: %p, %p, %d, %lu, %lu\n", Context->clCommandQue, Context->clKernel,  workDim, GlobalWG[0], GlobalWG[1]);
   cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
-      Context->clKernel, workDim, NULL, globalWorkSize, localWorkSize, 0, NULL, event);
+      Context->clKernel, workDim, NULL, GlobalWG, NULL, 0, NULL, NULL);
+  printf("Enqueued kernel\n");
   checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
   return event;
 }
@@ -262,6 +276,8 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) {
   // get the list of GPU devices associated with context
   errcode = clGetContextInfo(Context->clGPUContext, CL_CONTEXT_DEVICES, 0,
                               NULL, &dataBytes);
+  checkErr(errcode, CL_SUCCESS, "Failure to get context info length");
+
   cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
   errcode |= clGetContextInfo(Context->clGPUContext, CL_CONTEXT_DEVICES, dataBytes,
                               clDevices, NULL);
@@ -281,9 +297,11 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) {
                                         &binaryStatus, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
 
+  printf("Building kernel - %s from file %s\n", KernelName, FileName);
   errcode = clBuildProgram(Context->clProgram, 0, NULL, NULL, NULL, NULL);
   // If build fails, get build log from device
   if(errcode != CL_SUCCESS) {
+    printf("Failure to build program\n");
     size_t len = 0;
     errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0,
         NULL, &len);
@@ -295,10 +313,8 @@ void* llvm_visc_ptx_launch(const char* FileName, const char* KernelName) {
     checkErr(errcode, CL_SUCCESS, "Failure to collect program build log");
 
     printf("Device Build Log:\n%s\n", log);
-
-
+    exit(EXIT_FAILURE);
   }
-  checkErr(errcode, CL_SUCCESS, "Failure to build program");
 
   Context->clKernel = clCreateKernel(Context->clProgram, KernelName, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
diff --git a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll
index d1c081571b57bc4787eae321d52fa45ee5ab7a88..c76076a8726fcd1ff9fa9d355b1a0153e9c64e16 100644
--- a/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll
+++ b/llvm/test/VISC/MatrixMultiplication/visc_gemm_ptx.ll
@@ -12,6 +12,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 @custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1
 @hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1
+@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1
 @.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
 @.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1
 @.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1
@@ -135,7 +136,7 @@ declare i32 @printf(i8* nocapture, ...) #1
 
 ; --------------- VISC Intrinsics ---------------
 ; Return Type of VISC Compute Matrix Mul
-%rtype = type {float*, i64}
+%rtype = type {i64}
 %struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }>
 
 ; Function Attrs: nounwind
@@ -182,7 +183,7 @@ declare void @llvm.visc.bind.output(i8*, i32, i32)
 ; ----------------- VISC intrinsics end ------------------
 
 ; Function Attrs: nounwind uwtable
-define %rtype @matrixMul(float* nocapture %A, i64 %bytes_A, float* nocapture %B, i64 %bytes_B, float* %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
+define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
 entry:
   ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0))
   
@@ -235,8 +236,7 @@ for.end:                                          ; preds = %for.body, %entry
   store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0
   ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0))
   ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0))
-  %.fca.0.insert = insertvalue %rtype undef, float* %C, 0
-  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i64 %bytes_C, 1
+  %.fca.1.insert = insertvalue %rtype undef, i64 %bytes_C, 0
   ret %rtype %.fca.1.insert
 }
 
@@ -254,8 +254,7 @@ define %rtype @MatrixMulRoot(float* %h_A, i64 %bytes_A, float* %h_B, i64 %bytes_
   call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
   call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
   ; Bind Outputs
-  call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0); d_C
-  call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 1); bytes_C
+  call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0); bytes_C
   ret %rtype zeroinitializer
 }
 
@@ -373,7 +372,7 @@ randomInit.exit41:                                ; preds = %for.body.i40
   ; -------------------------------- Completed VISC Launch Call --------------------------------
 
   %3 = extractvalue %rtype %out, 0
-  %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %3)
+  %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2)
   %tobool = icmp eq i32 %call14, 0
   br i1 %tobool, label %if.else, label %if.then