diff --git a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
index c608f1feeb71edf44a75fa3af0ad942b89ef77e8..8282ed2374ad54d9fd0aae98b7f48f85a345115a 100644
--- a/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
@@ -11,6 +11,7 @@
 #define TARGET_PTX 32
 #define GENERIC_ADDRSPACE 0
 #define GLOBAL_ADDRSPACE 1
+#define SHARED_ADDRSPACE 3
 
 #define DEBUG_TYPE "DFG2LLVM_SPIR"
 #include "llvm/IR/DataLayout.h"
@@ -60,15 +61,16 @@ public:
 // calls
 class Kernel {
 public:
-  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::vector<unsigned> _inArgMap =
-           std::vector<unsigned>(), std::vector<unsigned> _outArgMap =
-           std::vector<unsigned>(), unsigned _gridDim = 0, std::vector<Value*>
-         _globalWGSize = std::vector<Value*>(),
-         unsigned _blockDim = 0,
-         std::vector<Value*> _localWGSize = std::vector<Value*>())
+  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap =
+         std::map<unsigned, unsigned>(),
+         std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap =
+         std::map<unsigned, std::pair<Value*, unsigned> >(),
+         std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
+         unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(),
+         unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>())
     : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
-      outArgMap(_outArgMap), gridDim(_gridDim), globalWGSize(_globalWGSize),
-      blockDim(_blockDim), localWGSize(_localWGSize) {
+      sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim),
+      globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) {
 
     assert(gridDim == globalWGSize.size()
            && "gridDim should be same as the size of vector globalWGSize");
@@ -78,7 +80,14 @@ public:
 
   Function* KernelFunction;
   DFLeafNode* KernelLeafNode;
-  std::vector<unsigned> inArgMap;
+  std::map<unsigned, unsigned> inArgMap;
+  // Map for shared memory arguments
+  std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap;
+  // Fields for (potential) allocation node
+  DFLeafNode* AllocationNode;
+  Function* AllocationFunction;
+  std::map<unsigned, unsigned> allocInArgMap;  
+
   std::vector<unsigned> outArgMap;
   unsigned gridDim;
   std::vector<Value*> globalWGSize;
@@ -86,13 +95,20 @@ public:
   std::vector<Value*> localWGSize;
   std::vector<int> localDimMap;
 
-  std::vector<unsigned> getInArgMap() {
+  std::map<unsigned, unsigned> getInArgMap() {
     return inArgMap;
   }
-  void setInArgMap(std::vector<unsigned> map) {
+  void setInArgMap(std::map<unsigned, unsigned> map) {
     inArgMap = map;
   }
 
+  std::map<unsigned, std::pair<Value*, unsigned> > getSharedInArgMap() {
+    return sharedInArgMap;  // by-value return: the caller gets its own copy of the map
+  }
+  void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) {
+    sharedInArgMap = map;  // overwrites the stored map wholesale with the caller's copy
+  }
+
   std::vector<unsigned> getOutArgMap() {
     return outArgMap;
   }
@@ -121,6 +137,9 @@ static void changeTargetTriple(Module &);
 static std::string printType(Type*);
 static StringRef getMangledName(std::string);
 static void findReturnInst(Function *, std::vector<ReturnInst *> &);
+static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &);
+static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID);
+static std::string getAtomicOpName(Intrinsic::ID);
 
 
 // DFG2LLVM_SPIR - The first implementation.
@@ -148,6 +167,7 @@ private:
   Constant* llvm_visc_ocl_wait;
   Constant* llvm_visc_ocl_initContext;
   Constant* llvm_visc_ocl_clearContext;
+  Constant* llvm_visc_ocl_argument_shared;
   Constant* llvm_visc_ocl_argument_scalar;
   Constant* llvm_visc_ocl_argument_ptr;
   Constant* llvm_visc_ocl_output_ptr;
@@ -159,6 +179,7 @@ private:
   std::string getKernelsModuleName(Module &M);
   void fixValueAddrspace(Value* V, unsigned addrspace);
   void changeArgAddrspace(Function* F, unsigned i);
+  void changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i);
   void addCLMetadata(Function* F);
   void transformFunctionToVoid(Function* F);
   void removeInOutAttributes(Function* F);
@@ -249,6 +270,7 @@ void CGT_SPIR::initRuntimeAPI() {
   DECLARE(llvm_visc_ocl_wait);
   DECLARE(llvm_visc_ocl_initContext);
   DECLARE(llvm_visc_ocl_clearContext);
+  DECLARE(llvm_visc_ocl_argument_shared);
   DECLARE(llvm_visc_ocl_argument_scalar);
   DECLARE(llvm_visc_ocl_argument_ptr);
   DECLARE(llvm_visc_ocl_output_ptr);
@@ -409,11 +431,20 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil
   // Vector to hold the device memory object that need to be cleared before we release
   // context
   std::vector<Value*> DevicePointers;
+
+  std::map<unsigned, unsigned> kernelInArgMap = K->getInArgMap();
+/*
   for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) {
 
     // The kernel object gives us the mapping of arguments from kernel launch
     // node function (F_X86) to kernel (kernel->KF)
     Value* inputVal = getArgumentAt(F_X86, K->getInArgMap()[i]);
+
+*/
+  for(std::map<unsigned, unsigned>::iterator ib = kernelInArgMap.begin(),
+      ie = kernelInArgMap.end(); ib != ie; ++ib) {
+    unsigned i = ib->first;
+    Value* inputVal = getArgumentAt(F_X86, ib->second);
     DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
 
     // input value has been obtained.
@@ -443,7 +474,12 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil
                              Type::getInt8PtrTy(M.getContext()),
                              inputVal->getName()+".i8ptr",
                              RI);
-      Value* inputSize = getArgumentAt(F_X86, K->getInArgMap()[i+1]);
+
+      // Assert that the pointer argument size (next argument) is in the map
+      assert(kernelInArgMap.find(i+1) != kernelInArgMap.end());
+
+      Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]);
+
       assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
              && "Pointer type input must always be followed by size (integer type)");
       Value* setInputArgs[] = {GraphID,
@@ -481,8 +517,122 @@ void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fil
       CallInst::Create(llvm_visc_ocl_argument_scalar,
                        ArrayRef<Value*>(setInputArgs, 4), "", RI);
     }
+  }
+
+  DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
 
+  // Check to see if all the allocation sizes are constant (determined
+  // statically)
+  bool constSizes = true;
+  for (auto& e: K->getSharedInArgMap()) {
+    constSizes &= isa<Constant>(e.second.first);
   }
+
+  // If the sizes are all constant
+  if (constSizes) {
+    for (auto& e: K->getSharedInArgMap()) {
+      unsigned argNum = e.first;
+      Value* allocSize = e.second.first;
+
+      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+
+      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
+        // Shared memory ptr argument - scalar at size position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+
+        assert(isa<Constant>(allocSize) && "Constant shared memory size is expected");
+
+        Value* setInputArgs[] = {GraphID,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 allocSize
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_shared,
+                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
+      }
+      else {
+        // Shared memory size argument - scalar at address position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        // Store the scalar value on stack and then pass the pointer to its
+        // location
+        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(),
+                                        allocSize->getName()+".sharedMem.ptr", RI);
+        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
+                               Type::getInt8PtrTy(M.getContext()),
+                               allocSize->getName()+".sharedMem.i8ptr",
+                               RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 allocSizeI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 ConstantExpr::getSizeOf(allocSize->getType())
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_scalar,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
+    }
+  } else {
+
+    Function *F_alloc = K->AllocationFunction;
+    StructType *FAllocRetTy = dyn_cast<StructType>(F_alloc->getReturnType());
+    assert(FAllocRetTy && "Allocation node with no struct return type");
+
+    std::vector<Value *> AllocInputArgs;
+    for (unsigned i = 0; i < K->allocInArgMap.size(); i++) {
+      AllocInputArgs.push_back(getArgumentAt(F_X86, K->allocInArgMap.at(i)));
+    }
+
+    CallInst *CI = CallInst::Create(F_alloc, AllocInputArgs, "", RI);
+    std::vector<ExtractValueInst *> ExtractValueInstVec;
+    for (unsigned i = 1; i < FAllocRetTy->getNumElements(); i += 2) {
+      ExtractValueInst *EI = ExtractValueInst::Create(CI, i, "", RI);
+      ExtractValueInstVec.push_back(EI);
+    }
+
+    for (auto& e: K->getSharedInArgMap()) {
+      unsigned argNum = e.first;
+      Value* allocSize = ExtractValueInstVec[e.second.second/2];
+
+      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+
+      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
+        // Shared memory ptr argument - scalar at size position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 allocSize
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_shared,
+                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
+      }
+      else {
+        // Shared memory size argument - scalar at address position
+        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
+        // Store the scalar value on stack and then pass the pointer to its
+        // location
+        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(),
+                                        allocSize->getName()+".sharedMem.ptr", RI);
+        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
+                               Type::getInt8PtrTy(M.getContext()),
+                               allocSize->getName()+".sharedMem.i8ptr",
+                               RI);
+
+        Value* setInputArgs[] = {GraphID,
+                                 allocSizeI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 ConstantExpr::getSizeOf(allocSize->getType())
+                                };
+        CallInst::Create(llvm_visc_ocl_argument_scalar,
+                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+      }
+    }
+  }
+
+
   DEBUG(errs() << "Setup output edges of node and insert visc api\n");
 
   // Set output if struct is not an empty struct
@@ -682,14 +832,17 @@ void CGT_SPIR::codeGen(DFInternalNode* N) {
   } else {
     DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
     // Keep track of the arguments order.
-    std::vector<unsigned> inmap1 = N->getInArgMap();
-    std::vector<unsigned> inmap2 = kernel->getInArgMap();
-    // TODO: Verify when we have incoming edges from more than one nodes
-    // The limit is the size of inmap2, because this is the number of kernel arguments
-    for (unsigned i = 0; i < inmap2.size(); i++) {
-      inmap2[i] = inmap1[inmap2[i]];
+    std::map<unsigned, unsigned> inmap1 = N->getInArgMap();
+    std::map<unsigned, unsigned> inmap2 = kernel->getInArgMap();
+    // TODO: Structure assumed: one thread node, one allocation node (at most),
+    // TB node
+    std::map<unsigned, unsigned> inmapFinal;
+    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end();
+         ib != ie; ++ib) {
+      inmapFinal[ib->first] = inmap1[ib->second];
     }
-    kernel->setInArgMap(inmap2);
+
+    kernel->setInArgMap(inmapFinal);
 
     // Keep track of the output arguments order.
     std::vector<unsigned> outmap1 = N->getOutArgMap();
@@ -723,6 +876,10 @@ void CGT_SPIR::codeGen(DFInternalNode* N) {
         // find the source location in Parent of N. Retrieve the argument from
         // parent to insert in the vector.
         unsigned argNum = Arg->getArgNo();
+        // This argument must come from the parent node, not the allocation node.
+        // count() keeps the check on one map even if getInArgMap() returns a copy.
+        assert(N->getInArgMap().count(argNum) && "Argument not mapped from parent node");
+
         unsigned parentArgNum = N->getInArgMap()[argNum];
         Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
         localWGSizeMapped.push_back(A);
@@ -770,6 +927,12 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
     return;
   }
 
+  // Skip code generation if it is an allocation node
+  if(N->isAllocationNode()) {
+    DEBUG(errs() << "Skipping allocation node\n");
+    return;
+  }
+
   // Generate code only if it has the right hint
   if(!checkPreferredTarget(N, visc::SPIR_TARGET)) {
     errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
@@ -791,6 +954,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
     kernel = new Kernel(NULL,
                         N,
                         N->getInArgMap(),
+                        N->getSharedInArgMap(),
                         N->getOutArgMap(),
                         N->getNumOfDim(),
                         N->getDimLimits());
@@ -804,6 +968,7 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
     kernel = new Kernel(NULL,                 // kernel function
                         N,                    // kernel leaf node
                         N->getInArgMap(),     // kenel argument mapping
+                        N->getSharedInArgMap(),
                         N->getOutArgMap(),    // kernel output mapping from the leaf to the interemediate node
                         PNode->getNumOfDim(), // gridDim
                         PNode->getDimLimits(),// grid size
@@ -846,6 +1011,93 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
   transformFunctionToVoid(F_spir);
   removeInOutAttributes(F_spir);
 
+  //FIXME: For now, assume only one allocation node
+  kernel->AllocationNode = NULL;
+
+  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end();
+       ieb != iee; ++ieb) {
+    DFNode *SrcDFNode = (*ieb)->getSourceDF();
+    DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n");
+    if (!SrcDFNode->isDummyNode()) {
+      assert(SrcDFNode->isAllocationNode());
+      kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode);
+      kernel->allocInArgMap = SrcDFNode->getInArgMap();
+      break;
+    }
+  }
+
+  // Vector for shared memory arguments
+  std::vector<unsigned> SharedMemArgs;
+
+  // If no allocation node was found, SharedMemArgs is empty
+  if (kernel->AllocationNode) {
+
+    ValueToValueMapTy VMap;
+    Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap, true);
+    // Insert the cloned function into the kernels module
+    M.getFunctionList().push_back(F_alloc);
+
+    std::vector<IntrinsicInst *> ViscMallocInstVec;
+    findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec);
+
+    for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) {
+      IntrinsicInst *II = ViscMallocInstVec[i];
+      assert(II->hasOneUse() && "visc_malloc result is used more than once");
+      II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
+      II->eraseFromParent();
+    }
+    kernel->AllocationFunction = F_alloc;
+
+    // This could be used to check that the allocation node has the appropriate
+    // number of fields in its return struct
+/*
+    ReturnInst *RI = ReturnInstVec[0];
+    Value *RetVal = RI->getReturnValue();
+    Type *RetTy = RetVal->getType();
+    StructType *RetStructTy = dyn_cast<StructType>(RetTy);
+    assert(RetStructTy && "Allocation node does not return a struct type");
+    unsigned numFields = RetStructTy->getNumElements();
+*/
+    std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap();
+    AllocationNodeProperty* APN =
+      (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation);
+    for (auto& AllocPair: APN->getAllocationList()) {
+      unsigned destPos = AllocPair.first->getDestPosition();
+      unsigned srcPos = AllocPair.first->getSourcePosition();
+      SharedMemArgs.push_back(destPos);
+      sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
+      sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
+    }
+    kernel->setSharedInArgMap(sharedInMap);
+  }
+  std::sort(SharedMemArgs.begin(), SharedMemArgs.end());
+
+  // All pointer args which are not shared memory pointers have to be moved to
+  // global address space
+  unsigned argIndex = 0;
+  std::vector<unsigned> GlobalMemArgs;
+  for(auto& Arg: F_spir->getArgumentList()) {
+    if (Arg.getType()->isPointerTy()) {
+      // If the argument is already chosen for the shared memory argument list, skip.
+      // Else put it in the global memory argument list
+      if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) {
+        GlobalMemArgs.push_back(argIndex);
+      }
+    }
+    argIndex++;
+  }
+  std::sort(GlobalMemArgs.begin(), GlobalMemArgs.end());
+
+  /* At this point, we assume that checks for the fact that SharedMemArgs only
+     contains pointer arguments to GLOBAL_ADDRSPACE have been performed by the
+     analysis pass */
+
+  changeArgAddrspace(F_spir, SharedMemArgs, SHARED_ADDRSPACE);
+  changeArgAddrspace(F_spir, GlobalMemArgs, GLOBAL_ADDRSPACE);
+
+
   // Go through all the instructions
   for (inst_iterator i = inst_begin(F_spir), e = inst_end(F_spir); i != e; ++i) {
     Instruction *I = &(*i);
@@ -1047,6 +1299,79 @@ void CGT_SPIR::codeGen(DFLeafNode* N) {
         IItoRemove.push_back(II);
       }
       break;
+      case Intrinsic::visc_barrier:
+      {
+        errs() << F_spir->getName() << "\t: Handling barrier\n";
+        errs() << "Substitute with barrier()\n";
+        errs() << *II << "\n";
+        FunctionType* FT = FunctionType::get(Type::getVoidTy(getGlobalContext() /*KernelM.getContext()*/),
+                              std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
+                              false);
+        Function* OpenCLFunction = cast<Function>
+                           (KernelM.getOrInsertFunction(getMangledName("barrier"), FT));
+        CallInst* CI = CallInst::Create(OpenCLFunction,
+                               ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(getGlobalContext()), 1)),
+                               "", II);
+        II->replaceAllUsesWith(CI);
+        IItoRemove.push_back(II); 
+      }
+      break;
+      case Intrinsic::visc_atomic_cmpxchg:
+      break;
+      case Intrinsic::visc_atomic_add:
+      case Intrinsic::visc_atomic_sub:
+      case Intrinsic::visc_atomic_xchg:
+      case Intrinsic::visc_atomic_min:
+      case Intrinsic::visc_atomic_max:
+      case Intrinsic::visc_atomic_and:
+      case Intrinsic::visc_atomic_or:
+      case Intrinsic::visc_atomic_xor:
+      //case Intrinsic::visc_atomic_inc:
+      //case Intrinsic::visc_atomic_dec:
+      {
+        errs() << *II << "\n";
+        // Only have support for i32 atomic intrinsics
+        assert(II->getType() == Type::getInt32Ty(II->getContext())
+            && "Only support i32 atomic intrinsics for now");
+        // Store the argument types and operand values in vectors
+        //std::vector<Type*> ArgTypes;
+        //std::vector<Value*> ArgValues;
+        //for(unsigned i=0; i < II->getNumArgOperands(); i++) {
+          //Value* V = II->getArgOperand(i);
+          //if(V->getType()->isPointerTy()) {
+            //If it is a pointer type, then bit cast to i32* as intrinsics use
+            //i8* for all pointers
+            //V = CastInst::CreatePointerCast(V, Type::getInt32PtrTy(II->getContext()), "", II);
+            //errs() << *V << "\n";
+          //}
+          //ArgTypes.push_back(V->getType());
+          //ArgValues.push_back(V);
+        //}
+        // Substitute with atomicrmw instruction
+        assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics");
+        Value* Ptr = II->getArgOperand(0);
+        Value* Val = II->getArgOperand(1);
+        assert(Ptr->getType()->isPointerTy()
+            && "First argument of supported atomics is expected to be a pointer");
+        PointerType* PtrTy = cast<PointerType>(Ptr->getType());
+        if(PtrTy != Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace())) {
+          Ptr = CastInst::CreatePointerCast(Ptr, Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()), "", II);
+        }
+        AtomicRMWInst* AtomicInst = new AtomicRMWInst(getAtomicOp(II->getIntrinsicID()),
+            Ptr, Val, llvm::SequentiallyConsistent, llvm::CrossThread, II);
+        AtomicInst->setVolatile(true);
+        // Create OpenCL function call
+        //FunctionType* FT = FunctionType::get(Type::getInt32Ty(getGlobalContext()),
+            //ArgTypes, false);
+        //Function* OpenCLFunction = cast<Function>(KernelM.getOrInsertFunction(
+              //StringRef(getAtomicOpName(II->getIntrinsicID())), FT));
+        //CallInst* CI = CallInst::Create(OpenCLFunction, ArgValues, II->getName(), II);
+        //errs() << "Substitute with: " << *CI << "\n";
+        errs() << "Substitute with: " << *AtomicInst << "\n";
+        II->replaceAllUsesWith(AtomicInst);
+        IItoRemove.push_back(II);
+      }
+      break;
       default:
         assert(false && "Unknown VISC Intrinsic!");
         break;
@@ -1161,6 +1486,25 @@ void CGT_SPIR::changeArgAddrspace(Function* F, unsigned addrspace) {
   DEBUG(errs() << *F->getFunctionType() << "\n" <<*F << "\n");
 }
 
+void CGT_SPIR::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
+  // Passes every argument of F whose index is listed in Args to
+  // fixValueAddrspace(), then rebuilds F's type from the argument types.
+  std::vector<Type*> ArgTypes;
+  for(auto& arg: F->getArgumentList()) {
+    DEBUG(errs() << arg << "\n");
+    // Membership test rather than a cursor: Args need not be sorted or unique
+    if (std::count(Args.begin(), Args.end(), arg.getArgNo()) != 0) {
+      fixValueAddrspace(&arg, addrspace);
+    }
+    ArgTypes.push_back(arg.getType());
+  }
+  FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, false);
+  PointerType* PTy = FTy->getPointerTo(cast<PointerType>(F->getType())->getAddressSpace());
+
+  F->mutateType(PTy);
+  DEBUG(errs() << *F->getFunctionType() << "\n" <<*F << "\n");
+}
+
 /* Add metadata to module KernelM, for OpenCL kernels */
 void CGT_SPIR::addCLMetadata(Function *F) {
   // TODO: There is additional metadata used by kernel files but we skip them as
@@ -1474,6 +1818,55 @@ static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVe
   }
 }
 
+// Helper function: appends every IntrinsicID intrinsic found in F to IntrinsicInstVec
+static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
+  // Matches are only appended; entries already in the vector are left alone
+  for (inst_iterator It = inst_begin(F), End = inst_end(F); It != End; ++It) {
+    IntrinsicInst* Candidate = dyn_cast<IntrinsicInst>(&*It);
+    if (!Candidate || Candidate->getIntrinsicID() != IntrinsicID)
+      continue;
+    IntrinsicInstVec.push_back(Candidate);
+  }
+}
+
+// Helper function, returns the atomicrmw op corresponding to the intrinsic atomic op
+static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) {
+  switch(ID) {
+    case Intrinsic::visc_atomic_add: return AtomicRMWInst::Add;
+    case Intrinsic::visc_atomic_sub: return AtomicRMWInst::Sub;
+    case Intrinsic::visc_atomic_min: return AtomicRMWInst::Min; // NOTE(review): presumably LLVM's signed min - confirm intent
+    case Intrinsic::visc_atomic_max: return AtomicRMWInst::Max; // NOTE(review): presumably LLVM's signed max - confirm intent
+    //case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc;
+    //case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec;
+    case Intrinsic::visc_atomic_xchg: return AtomicRMWInst::Xchg;
+    case Intrinsic::visc_atomic_and: return AtomicRMWInst::And;
+    case Intrinsic::visc_atomic_or: return AtomicRMWInst::Or;
+    case Intrinsic::visc_atomic_xor: return AtomicRMWInst::Xor;
+    default: // every ID without an active case above lands here, including cmpxchg, inc and dec
+      llvm_unreachable("Unsupported atomic intrinsic!");
+  };
+}
+
+// Helper function, returns the OpenCL function name corresponding to the atomic op
+static std::string getAtomicOpName(Intrinsic::ID ID) { // NOTE(review): only referenced from commented-out code in the visible hunks
+  switch(ID) {
+    case Intrinsic::visc_atomic_cmpxchg: return "atom_cmpxchg";
+    case Intrinsic::visc_atomic_add: return "atom_add";
+    case Intrinsic::visc_atomic_sub: return "atom_sub";
+    case Intrinsic::visc_atomic_min: return "atom_min";
+    case Intrinsic::visc_atomic_max: return "atom_max";
+    case Intrinsic::visc_atomic_inc: return "atom_inc";
+    case Intrinsic::visc_atomic_dec: return "atom_dec";
+    case Intrinsic::visc_atomic_xchg: return "atom_xchg";
+    case Intrinsic::visc_atomic_and: return "atom_and";
+    case Intrinsic::visc_atomic_or: return "atom_or";
+    case Intrinsic::visc_atomic_xor: return "atom_xor";
+    default: // any intrinsic ID that has no entry in the table above
+      llvm_unreachable("Unsupported atomic intrinsic!");
+  };
+}
+
+
 } // End of namespace
 
 char DFG2LLVM_SPIR::ID = 0;