diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h
index e5a448afb407a4fe31d7b4b26262083ceaf18d61..e379ca4296f8fe4754152ef6757483622736b049 100644
--- a/llvm/include/llvm/IR/DFGraph.h
+++ b/llvm/include/llvm/IR/DFGraph.h
@@ -368,7 +368,7 @@ public:
   DFEdge* getInDFEdgeAt(unsigned inPort);
   DFEdge* getOutDFEdgeAt(unsigned outPort);
   std::map<unsigned, unsigned> getInArgMap();
-  std::map<unsigned, Value*> getSharedInArgMap();
+  std::map<unsigned, std::pair<Value*, unsigned> > getSharedInArgMap();
   std::vector<unsigned> getOutArgMap();
   int getAncestorHops(DFNode* N);
   bool hasSideEffects();
@@ -654,14 +654,14 @@ std::map<unsigned, unsigned> DFNode::getInArgMap() {
   return map;
 }
 
-// Only Allocation Nodes
-std::map<unsigned, Value*> DFNode::getSharedInArgMap() {
-  std::map<unsigned, Value*> map;
+// Only Allocation Nodes - only detect relevant indices
+std::map<unsigned, std::pair<Value*, unsigned> > DFNode::getSharedInArgMap() {
+  std::map<unsigned, std::pair<Value*, unsigned> > map;
   for (unsigned i = 0; i < InDFEdges.size(); i++) {
     DFEdge* E = getInDFEdgeAt(i);
     if (!E->getSourceDF()->isAllocationNode())
       continue;
-    map[i] = NULL;
+    map[i] = std::pair<Value *, unsigned>(NULL,0);
   }
   return map;
 }
diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 942b79f6861921e7aee6a637b5b6ed3b70948322..cc21b5611d8732eba99d8e3bbbb7bf51a036337e 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -60,12 +60,12 @@ public:
 class Kernel {
 public:
   Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap =
-           std::map<unsigned, unsigned>(), std::map<unsigned, Value*> _sharedInArgMap =
-           std::map<unsigned, Value*>(), std::vector<unsigned> _outArgMap =
-           std::vector<unsigned>(), unsigned _gridDim = 0, std::vector<Value*>
-         _globalWGSize = std::vector<Value*>(),
-         unsigned _blockDim = 0,
-         std::vector<Value*> _localWGSize = std::vector<Value*>())
+         std::map<unsigned, unsigned>(),
+         std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap =
+         std::map<unsigned, std::pair<Value*, unsigned> >(),
+         std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
+         unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(),
+         unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>())
     : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
       sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim),
       globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) {
@@ -80,9 +80,10 @@ public:
   DFLeafNode* KernelLeafNode;
   std::map<unsigned, unsigned> inArgMap;
   // Map for shared memory arguments
-  std::map<unsigned, Value*> sharedInArgMap;
+  std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap;
   // Fields for (potential) allocation node
   DFLeafNode* AllocationNode;
+  Function* AllocationFunction;
   std::map<unsigned, unsigned> allocInArgMap;  
 
   std::vector<unsigned> outArgMap;
@@ -99,10 +100,10 @@ public:
     inArgMap = map;
   }
 
-  std::map<unsigned, Value*> getSharedInArgMap() {
+  std::map<unsigned, std::pair<Value*, unsigned> > getSharedInArgMap() {
     return sharedInArgMap;
   }
-  void setSharedInArgMap(std::map<unsigned, Value*> map) {
+  void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) {
     sharedInArgMap = map;
   }
 
@@ -132,7 +133,7 @@ static std::string getFilenameFromModule(const Module& M);
 static void changeDataLayout(Module &);
 static void changeTargetTriple(Module &);
 static void findReturnInst(Function *, std::vector<ReturnInst *> &);
-
+static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &);
 
 // DFG2LLVM_NVPTX - The first implementation.
 struct DFG2LLVM_NVPTX : public DFG2LLVM {
@@ -508,51 +509,126 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   }
   DEBUG(errs() << "Setup output edges of node and insert visc api\n");
 
-  std::map<unsigned, Value*> kernelSharedInArgMap = K->getSharedInArgMap();
-
-  for(std::map<unsigned, Value*>::iterator ib = kernelSharedInArgMap.begin(),
-      ie = kernelSharedInArgMap.end(); ib != ie; ++ib) {
-    unsigned i = ib->first;
-    Value* inputVal = ib->second;
+  std::map<unsigned, std::pair<Value*, unsigned> > kernelSharedInArgMap =
+   K->getSharedInArgMap();
 
-    DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
+  bool constSizes = true;
+  for (std::map<unsigned, std::pair<Value*, unsigned> >::iterator
+       ib = kernelSharedInArgMap.begin(),
+       ie = kernelSharedInArgMap.end(); ib != ie && constSizes; ++ib) {
+    Value* sizeVal = ib->second.first;
+    constSizes = isa<Constant>(sizeVal);
+  }
 
-    // input value has been obtained.
-    // inputVal is a scalar value
-    if (i % 2 == 0) { // Sharem memory ptr argument - scalar at size position
-      switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+  if (constSizes) {
+    for (std::map<unsigned, std::pair<Value*, unsigned> >::iterator
+         ib = kernelSharedInArgMap.begin(),
+         ie = kernelSharedInArgMap.end(); ib != ie; ++ib) {
+      unsigned i = ib->first;
+      Value* inputVal = ib->second.first;
+
+      DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
+
+      // input value has been obtained.
+      // inputVal is a scalar value
+      if (i % 2 == 0) { // Shared memory ptr argument - scalar at size position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+
+        assert(isa<Constant>(inputVal) && "Only constant shared memory size is supported");
+
+        Value* setInputArgs[] = {GraphID,
+                                 ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())),
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                                 inputVal
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_scalar,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+
+      } else { // Shared memory size argument - scalar at address position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        // Store the scalar value on stack and then pass the pointer to its
+        // location
+        AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(),
+          kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".ptr", RI);
+        StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
+
+        Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
+                               Type::getInt8PtrTy(M.getContext()),
+                               kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".i8ptr",
+                               RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 inputValI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                                 ConstantExpr::getSizeOf(inputVal->getType())
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_scalar,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
+    }
+  } else {
 
-      assert(isa<Constant>(inputVal) && "Only constant shared memory size is supported");
+    Function *F_alloc = K->AllocationFunction;
+    StructType *FAllocRetTy = dyn_cast<StructType>(F_alloc->getReturnType());
+    assert(FAllocRetTy && "Allocation node with no struct return type");
 
-      Value* setInputArgs[] = {GraphID,
-                               ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())),
-                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                               inputVal
-                              };
-      CallInst::Create(llvm_visc_ocl_argument_scalar,
-                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
-
-    } else { // Sharem memory size argument - scalar at address position
-      switchToTimer(visc_TimerID_COPY_SCALAR, RI);
-      // Store the scalar value on stack and then pass the pointer to its
-      // location
-      AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".ptr", RI);
-      StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
+    std::vector<Value *> AllocInputArgs;
+    for (unsigned i = 0; i < K->allocInArgMap.size(); i++) {
+      AllocInputArgs.push_back(getArgumentAt(F_X86, K->allocInArgMap.at(i)));
+    }
 
-      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
-                             Type::getInt8PtrTy(M.getContext()),
-                             kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".i8ptr",
-                             RI);
+    CallInst *CI = CallInst::Create(F_alloc, AllocInputArgs, "", RI);
+    std::vector<ExtractValueInst *> ExtractValueInstVec;
+    for (unsigned i = 1; i < FAllocRetTy->getNumElements(); i += 2) {
+      ExtractValueInst *EI = ExtractValueInst::Create(CI, i, "", RI);
+      ExtractValueInstVec.push_back(EI);
+    }
 
-      Value* setInputArgs[] = {GraphID,
-                               inputValI8Ptr,
-                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                               ConstantExpr::getSizeOf(inputVal->getType())
-                              };
-      CallInst::Create(llvm_visc_ocl_argument_scalar,
-                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
+    for (std::map<unsigned, std::pair<Value*, unsigned> >::iterator
+         ib = kernelSharedInArgMap.begin(),
+         ie = kernelSharedInArgMap.end(); ib != ie; ++ib) {
+      unsigned i = ib->first;
+      Value* inputVal = ExtractValueInstVec[ib->second.second/2];
+
+      DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
+
+      // input value has been obtained.
+      // inputVal is a scalar value
+      if (i % 2 == 0) { // Shared memory ptr argument - scalar at size position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())),
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                                 inputVal
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_scalar,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+
+      } else { // Shared memory size argument - scalar at address position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        // Store the scalar value on stack and then pass the pointer to its
+        // location
+        AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(),
+          kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".ptr", RI);
+        StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
+
+        Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
+                               Type::getInt8PtrTy(M.getContext()),
+                               kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".i8ptr",
+                               RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 inputValI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                                 ConstantExpr::getSizeOf(inputVal->getType())
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_scalar,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
     }
   }
+
   DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
 
   // Set output if struct is not an empty struct
@@ -927,14 +1003,42 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
 
   // If no allocation node was found, SharedMemArgs is empty
   if (kernel->AllocationNode) {
-    std::map<unsigned, Value*> sharedInMap = kernel->getSharedInArgMap();
+
+    ValueToValueMapTy VMap;
+    Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap, true);
+    // Insert the cloned function into the kernels module
+    M.getFunctionList().push_back(F_alloc);
+
+    std::vector<IntrinsicInst *> ViscMallocInstVec;
+    findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec);
+
+    for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) {
+      IntrinsicInst *II = ViscMallocInstVec[i];
+      assert(II->hasOneUse() && "visc_malloc result is used more than once");
+      II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
+      II->eraseFromParent();
+    }
+    kernel->AllocationFunction = F_alloc;
+
+    // This could be used to check that the allocation node has the appropriate
+    // number of fields in its return struct
+/*
+    ReturnInst *RI = ReturnInstVec[0];
+    Value *RetVal = RI->getReturnValue();
+    Type *RetTy = RetVal->getType();
+    StructType *RetStructTy = dyn_cast<StructType>(RetTy);
+    assert(RetStructTy && "Allocation node does not return a struct type");
+    unsigned numFields = RetStructTy->getNumElements();
+*/
+    std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap();
     AllocationNodeProperty* APN =
       (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation);
     for (auto& AllocPair: APN->getAllocationList()) {
       unsigned destPos = AllocPair.first->getDestPosition();
+      unsigned srcPos = AllocPair.first->getSourcePosition();
       SharedMemArgs.push_back(destPos);
-      sharedInMap[destPos] = AllocPair.second;
-      sharedInMap[destPos+1] = AllocPair.second;
+      sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
+      sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
     }
     kernel->setSharedInArgMap(sharedInMap);
   }
@@ -1583,6 +1687,17 @@ static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVe
   }
 }
 
+// Helper function: appends each IntrinsicID intrinsic call in F to IntrinsicInstVec
+static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
+  for (inst_iterator It = inst_begin(F), End = inst_end(F); It != End; ++It) {
+    // Only intrinsic calls whose ID matches the requested one are recorded
+    IntrinsicInst* Candidate = dyn_cast<IntrinsicInst>(&*It);
+    if (!Candidate || Candidate->getIntrinsicID() != IntrinsicID)
+      continue;
+    IntrinsicInstVec.push_back(Candidate);
+  }
+}
+
 } // End of namespace
 
 char DFG2LLVM_NVPTX::ID = 0;