diff --git a/llvm/include/llvm/IR/DFGraph.h b/llvm/include/llvm/IR/DFGraph.h
index 11473cec41f0df45b14b0bc6ab4b480c08a5aedc..ab17261f19d23228e7aaf2903a19e010249e0146 100644
--- a/llvm/include/llvm/IR/DFGraph.h
+++ b/llvm/include/llvm/IR/DFGraph.h
@@ -373,7 +373,8 @@ public:
   bool isExitNode();
   DFEdge* getInDFEdgeAt(unsigned inPort);
   DFEdge* getOutDFEdgeAt(unsigned outPort);
-  std::vector<unsigned> getInArgMap();
+  std::map<unsigned, unsigned> getInArgMap();
+  std::map<unsigned, Value*> getSharedInArgMap();
   std::vector<unsigned> getOutArgMap();
   int getAncestorHops(DFNode* N);
   bool hasSideEffects();
@@ -652,16 +653,31 @@ DFEdge* DFNode::getOutDFEdgeAt(unsigned outPort) {
   return NULL;
 }
 
-std::vector<unsigned> DFNode::getInArgMap() {
-  std::vector<unsigned> map(InDFEdges.size());
+// Maps in-port index -> source position of its edge; allocation-node edges are skipped
+std::map<unsigned, unsigned> DFNode::getInArgMap() {
+  std::map<unsigned, unsigned> map;
   for (unsigned i = 0; i < InDFEdges.size(); i++) {
     DFEdge* E = getInDFEdgeAt(i);
+    if (E->getSourceDF()->isAllocationNode())
+      continue;
     unsigned pos = E->getSourcePosition();
     map[i] = pos;
   }
   return map;
 }
 
+// Companion of getInArgMap(): records only in-ports fed by allocation nodes.
+std::map<unsigned, Value*> DFNode::getSharedInArgMap() {
+  std::map<unsigned, Value*> sharedPorts;
+  for (unsigned port = 0; port < InDFEdges.size(); ++port) {
+    DFEdge* inEdge = getInDFEdgeAt(port);
+    // Each recorded port starts with a NULL value.
+    if (inEdge->getSourceDF()->isAllocationNode())
+      sharedPorts[port] = NULL;
+  }
+  return sharedPorts;
+}
+
 std::vector<unsigned> DFNode::getOutArgMap() {
   std::vector<unsigned> map(OutDFEdges.size());
   for (unsigned i = 0; i < OutDFEdges.size(); i++) {
diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 525367fa05853010fa29f85b476bd44211a0899b..b3ea4c6216776bc380c0e46e3d1c35e3ad63aee3 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -11,6 +11,7 @@
 #define TARGET_PTX 32
 #define GENERIC_ADDRSPACE 0
 #define GLOBAL_ADDRSPACE 1
+#define SHARED_ADDRSPACE 3
 
 #define DEBUG_TYPE "DFG2LLVM_NVPTX"
 #include "llvm/IR/DataLayout.h"
@@ -58,15 +59,16 @@ public:
 // calls
 class Kernel {
 public:
-  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::vector<unsigned> _inArgMap =
-           std::vector<unsigned>(), std::vector<unsigned> _outArgMap =
+  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap =
+           std::map<unsigned, unsigned>(), std::map<unsigned, Value*> _sharedInArgMap =
+           std::map<unsigned, Value*>(), std::vector<unsigned> _outArgMap =
            std::vector<unsigned>(), unsigned _gridDim = 0, std::vector<Value*>
          _globalWGSize = std::vector<Value*>(),
          unsigned _blockDim = 0,
          std::vector<Value*> _localWGSize = std::vector<Value*>())
     : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
-      outArgMap(_outArgMap), gridDim(_gridDim), globalWGSize(_globalWGSize),
-      blockDim(_blockDim), localWGSize(_localWGSize) {
+      sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim),
+      globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) {
 
     assert(gridDim == globalWGSize.size()
            && "gridDim should be same as the size of vector globalWGSize");
@@ -76,7 +78,13 @@ public:
 
   Function* KernelFunction;
   DFLeafNode* KernelLeafNode;
-  std::vector<unsigned> inArgMap;
+  std::map<unsigned, unsigned> inArgMap;
+  // Map for shared memory arguments
+  std::map<unsigned, Value*> sharedInArgMap;
+  // Fields for (potential) allocation node; NULL until one is found
+  DFLeafNode* AllocationNode = NULL;
+  std::map<unsigned, unsigned> allocInArgMap;
+
   std::vector<unsigned> outArgMap;
   unsigned gridDim;
   std::vector<Value*> globalWGSize;
@@ -84,13 +92,20 @@ public:
   std::vector<Value*> localWGSize;
   std::vector<int> localDimMap;
 
-  std::vector<unsigned> getInArgMap() {
+  std::map<unsigned, unsigned> getInArgMap() {
     return inArgMap;
   }
-  void setInArgMap(std::vector<unsigned> map) {
+  void setInArgMap(std::map<unsigned, unsigned> map) {
     inArgMap = map;
   }
 
+  std::map<unsigned, Value*> getSharedInArgMap() {
+    return this->sharedInArgMap; // handed out as a copy
+  }
+  void setSharedInArgMap(std::map<unsigned, Value*> newSharedMap) {
+    this->sharedInArgMap = newSharedMap;
+  }
+
   std::vector<unsigned> getOutArgMap() {
     return outArgMap;
   }
@@ -138,7 +153,7 @@ private:
   Module &KernelM;
   DFNode* KernelLaunchNode;
   Kernel* kernel;
-
+  
   // VISC Runtime API
   Constant* llvm_visc_ocl_launch;
   Constant* llvm_visc_ocl_wait;
@@ -155,6 +170,7 @@ private:
   std::string getKernelsModuleName(Module &M);
   void fixValueAddrspace(Value* V, unsigned addrspace);
   void changeArgAddrspace(Function* F, unsigned i);
+  void changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace);
   void addCLMetadata(Function* F);
   void transformFunctionToVoid(Function* F);
   void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName);
@@ -403,11 +419,21 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   // Vector to hold the device memory object that need to be cleared before we release
   // context
   std::vector<Value*> DevicePointers;
+
+  std::map<unsigned, unsigned> kernelInArgMap = K->getInArgMap();
+/*
   for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) {
 
     // The kernel object gives us the mapping of arguments from kernel launch
     // node function (F_X86) to kernel (kernel->KF)
     Value* inputVal = getArgumentAt(F_X86, K->getInArgMap()[i]);
+
+*/
+
+  for(std::map<unsigned, unsigned>::iterator ib = kernelInArgMap.begin(),
+      ie = kernelInArgMap.end(); ib != ie; ++ib) {
+    unsigned i = ib->first;
+    Value* inputVal = getArgumentAt(F_X86, ib->second);
     DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
 
     // input value has been obtained.
@@ -437,7 +463,11 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
                              Type::getInt8PtrTy(M.getContext()),
                              inputVal->getName()+".i8ptr",
                              RI);
-      Value* inputSize = getArgumentAt(F_X86, K->getInArgMap()[i+1]);
+
+      // Assert that the pointer argument size (next argument) is in the map
+      assert(kernelInArgMap.find(i+1) != kernelInArgMap.end());
+
+      Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]);
       assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
              && "Pointer type input must always be followed by size (integer type)");
       Value* setInputArgs[] = {GraphID,
@@ -475,10 +505,56 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
       CallInst::Create(llvm_visc_ocl_argument_scalar,
                        ArrayRef<Value*>(setInputArgs, 4), "", RI);
     }
-
   }
   DEBUG(errs() << "Setup output edges of node and insert visc api\n");
 
+  std::map<unsigned, Value*> kernelSharedInArgMap = K->getSharedInArgMap();
+
+  for(std::map<unsigned, Value*>::iterator ib = kernelSharedInArgMap.begin(),
+      ie = kernelSharedInArgMap.end(); ib != ie; ++ib) {
+    unsigned i = ib->first;
+    Value* inputVal = ib->second;
+
+    DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
+
+    // input value has been obtained; inputVal is a scalar value.
+    // NOTE(review): assumes shared memory pointers sit at even indices - confirm.
+    if (i % 2 == 0) { // Shared memory ptr argument - scalar at size position
+      switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+
+      assert(isa<Constant>(inputVal) && "Only constant shared memory size is supported");
+
+      Value* setInputArgs[] = {GraphID,
+                               ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())),
+                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                               inputVal
+                              };
+      CallInst::Create(llvm_visc_ocl_argument_scalar,
+                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
+
+    } else { // Shared memory size argument - scalar at address position
+      switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+      // Store the scalar value on stack and then pass the pointer to its
+      // location
+      AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".ptr", RI);
+      StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
+
+      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
+                             Type::getInt8PtrTy(M.getContext()),
+                             kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".i8ptr",
+                             RI);
+
+      Value* setInputArgs[] = {GraphID,
+                               inputValI8Ptr,
+                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
+                               ConstantExpr::getSizeOf(inputVal->getType())
+                              };
+      CallInst::Create(llvm_visc_ocl_argument_scalar,
+                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
+    }
+  }
+  DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
+
   // Set output if struct is not an empty struct
   StructType* OutputTy = K->KernelLeafNode->getOutputType();
   std::vector<Value*> d_Outputs;
@@ -675,13 +751,17 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
   } else {
     DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
     // Keep track of the arguments order.
-    std::vector<unsigned> inmap1 = N->getInArgMap();
-    std::vector<unsigned> inmap2 = kernel->getInArgMap();
-    // TODO: Verify when we have incoming edges from more than one nodes The
-    // limit is the size of inmap2, because this is the number of kernel
-    // arguments
-    for (unsigned i = 0; i < inmap2.size(); i++) { inmap2[i] =
-      inmap1[inmap2[i]]; } kernel->setInArgMap(inmap2);
+    std::map<unsigned, unsigned> inmap1 = N->getInArgMap();
+    std::map<unsigned, unsigned> inmap2 = kernel->getInArgMap();
+    // TODO: Structure assumed: one thread node, one allocation node (at most),
+    // TB node
+    std::map<unsigned, unsigned> inmapFinal;
+    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end();
+         ib != ie; ++ib) {
+      assert(inmap1.count(ib->second) && "kernel argument source missing from node input map");
+      inmapFinal[ib->first] = inmap1[ib->second];
+    }
+    kernel->setInArgMap(inmapFinal);
 
     // Keep track of the output arguments order.
     std::vector<unsigned> outmap1 = N->getOutArgMap();
@@ -715,6 +795,10 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
         // find the source location in Parent of N. Retrieve the argument from
         // parent to insert in the vector.
         unsigned argNum = Arg->getArgNo();
+        // This argument will be coming from the parent node, not the allocation
+        // Node. getInArgMap() returns a fresh copy per call, so test membership on
+        // one copy; comparing find()/end() of two different copies is undefined.
+        assert(N->getInArgMap().count(argNum) && "argument missing from node input map");
         unsigned parentArgNum = N->getInArgMap()[argNum];
         Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
         localWGSizeMapped.push_back(A);
@@ -736,13 +820,18 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
     DEBUG(errs() << "Skipping dummy node\n");
     return;
   }
+
+  // Skip code generation if it is an allocation node
+  if(N->isAllocationNode()) {
+    DEBUG(errs() << "Skipping allocation node\n");
+    return;
+  }
  
   // Generate code only if it has the right hint
   if(!checkPreferredTarget(N, visc::GPU_TARGET)) {
     errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
     return;
   }
- 
 
   // Checking which node is the kernel launch
   DFNode* PNode = N->getParent();
@@ -758,6 +847,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
     kernel = new Kernel(NULL,
                         N,
                         N->getInArgMap(),
+                        N->getSharedInArgMap(),
                         N->getOutArgMap(),
                         N->getNumOfDim(),
                         N->getDimLimits());
@@ -771,6 +861,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
     kernel = new Kernel(NULL,                 // kernel function
                         N,                    // kernel leaf node
                         N->getInArgMap(),     // kenel argument mapping
+                        N->getSharedInArgMap(),
                         N->getOutArgMap(),    // kernel output mapping from the leaf to the interemediate node
                         PNode->getNumOfDim(), // gridDim
                         PNode->getDimLimits(),// grid size
@@ -812,6 +903,48 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
 
   transformFunctionToVoid(F_nvptx);
 
+
+  //FIXME: For now, assume only one allocation node
+  kernel->AllocationNode = NULL;
+
+  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end();
+       ieb != iee; ++ieb) {
+    DFNode *SrcDFNode = (*ieb)->getSourceDF();
+    errs() << "Found edge from node: "<< SrcDFNode->getFuncPointer()->getName() << "\n";
+    errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n";
+    if (SrcDFNode != PNode) {
+      assert(SrcDFNode->isAllocationNode());
+      kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode);
+      kernel->allocInArgMap = SrcDFNode->getInArgMap();
+      break;
+    }
+  }
+
+  // Vector for shared memory arguments
+  std::vector<unsigned> SharedMemArgs;
+
+  // If no allocation node was found, SharedMemArgs is empty
+  if (kernel->AllocationNode) {
+    std::map<unsigned, Value*> sharedInMap = kernel->getSharedInArgMap();
+    AllocationNodeProperty* APN =
+      (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation);
+    for (auto& AllocPair: APN->getAllocationList()) {
+      unsigned destPos = AllocPair.first->getDestPosition();
+      SharedMemArgs.push_back(destPos);
+      sharedInMap[destPos] = AllocPair.second;
+      sharedInMap[destPos+1] = AllocPair.second;
+    }
+    kernel->setSharedInArgMap(sharedInMap);
+  }
+  std::sort(SharedMemArgs.begin(), SharedMemArgs.end());
+
+  /* At this point, we assume that the analysis pass has already performed the
+     checks that SharedMemArgs only contains pointer arguments to
+     GLOBAL_ADDRSPACE. */
+
+  changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE);
+
+
   // Go through all the instructions
   for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
     Instruction *I = &(*i);
@@ -1138,6 +1271,25 @@ void CGT_NVPTX::changeArgAddrspace(Function* F, unsigned addrspace) {
   DEBUG(errs() << *F->getFunctionType() << "\n" <<*F << "\n");
 }
 
+// Applies fixValueAddrspace(arg, addrspace) to every argument of F whose index
+// appears in Args, then rebuilds F's function type from the argument types.
+// Args may be given in any order and may contain repeated indices.
+void CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
+  std::vector<Type*> ArgTypes;
+  for(auto& arg: F->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    // Membership test rather than a moving cursor: a cursor silently skips
+    // every later index once Args is unsorted or holds a duplicate.
+    if (std::find(Args.begin(), Args.end(), arg.getArgNo()) != Args.end())
+      fixValueAddrspace(&arg, addrspace);
+    ArgTypes.push_back(arg.getType());
+  }
+  FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, false);
+  PointerType* PTy = FTy->getPointerTo(cast<PointerType>(F->getType())->getAddressSpace());
+  F->mutateType(PTy);
+  DEBUG(errs() << *F->getFunctionType() << "\n" <<*F << "\n");
+}
+
 /* Add metadata to module KernelM, for OpenCL kernels */
 void CGT_NVPTX::addCLMetadata(Function *F) {