From 9de3eae28281ab36772e6bf30e931adc0a60b6c0 Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <psrivas2@illinois.edu>
Date: Fri, 11 Mar 2016 03:00:28 -0600
Subject: [PATCH] (1) Added atomic visc intrinsics and support for them in PTX
 backend     - IntrinsicsVISC.td, visc.h, GenVISC.cpp (2) Simplified GenVISC
 to easily add support for new intrinsics which have a 1 to 1 mapping
 instruction during code gen (3) Added runtime api call to set ocl shared
 memory argument

---
 llvm/include/llvm/IR/IntrinsicsVISC.td        |  50 ++-
 .../DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp         | 223 +++++++----
 llvm/lib/Transforms/GenVISC/GenVISC.cpp       | 355 ++++++++----------
 llvm/lib/Transforms/LocalMem/LocalMem.cpp     |   2 +-
 llvm/projects/visc-rt/visc-rt.cpp             |  12 +-
 llvm/projects/visc-rt/visc-rt.h               |   1 +
 llvm/test/VISC/parboil/.ycm_extra_conf.py     |   1 +
 llvm/test/VISC/parboil/common/include/visc.h  |  25 ++
 8 files changed, 390 insertions(+), 279 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsVISC.td b/llvm/include/llvm/IR/IntrinsicsVISC.td
index 7f56304384..cb30dffe52 100644
--- a/llvm/include/llvm/IR/IntrinsicsVISC.td
+++ b/llvm/include/llvm/IR/IntrinsicsVISC.td
@@ -144,8 +144,8 @@ let TargetPrefix = "visc" in {
    * intrinsic -
    * i32 llvm.visc.getNumNodeInstances(i8*, i32);
    */
-//  def int_visc_getNumNodeInstances : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty,
-//                                               llvm_i32_ty], []>;
+  /*def int_visc_getNumNodeInstances : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty,*/
+                                               /*llvm_i32_ty], []>;*/
 
   /* i32 llvm.visc.getNumNodeInstances.[xyz](i8*);
    */
@@ -175,6 +175,52 @@ let TargetPrefix = "visc" in {
   def int_visc_getVectorLength : Intrinsic<[llvm_i32_ty], [], []>;
 
   /* ============ Atomic intrinsics ============= */
+  // Atomic arithmetic operations
+  
+  /* i32 llvm.visc.atomic.cmpxchg(i32*, i32)*/
+  def int_visc_atomic_cmpxchg: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty,
+                                          llvm_i32_ty], []>;
 
+  /* i32 llvm.visc.atomic.add(i32*, i32)*/
+  def int_visc_atomic_add: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.visc.atomic.sub(i32*, i32)*/
+  def int_visc_atomic_sub: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.visc.atomic.xchg(i32*, i32)*/
+  def int_visc_atomic_xchg: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.visc.atomic.inc(i32*)*/
+  def int_visc_atomic_inc: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+                                    []>;
+
+  /* i32 llvm.visc.atomic.dec(i32*)*/
+  def int_visc_atomic_dec: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+                                    []>;
+
+  /* i32 llvm.visc.atomic.min(i32*, i32)*/
+  def int_visc_atomic_min: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.visc.atomic.max(i32*, i32)*/
+  def int_visc_atomic_max: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  // Atomic bitwise operations
+
+  /* i32 llvm.visc.atomic.and(i32*, i32)*/
+  def int_visc_atomic_and: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.visc.atomic.or(i32*, i32)*/
+  def int_visc_atomic_or: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
+
+  /* i32 llvm.visc.atomic.xor(i32*, i32)*/
+  def int_visc_atomic_xor: Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty],
+                                    []>;
 
 }
diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index be46aec0dd..f3a32a9828 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -134,6 +134,8 @@ static void changeDataLayout(Module &);
 static void changeTargetTriple(Module &);
 static void findReturnInst(Function *, std::vector<ReturnInst *> &);
 static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &);
+static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID);
+static std::string getAtomicOpName(Intrinsic::ID);
 
 // DFG2LLVM_NVPTX - The first implementation.
 struct DFG2LLVM_NVPTX : public DFG2LLVM {
@@ -160,6 +162,7 @@ private:
   Constant* llvm_visc_ocl_wait;
   Constant* llvm_visc_ocl_initContext;
   Constant* llvm_visc_ocl_clearContext;
+  Constant* llvm_visc_ocl_argument_shared;
   Constant* llvm_visc_ocl_argument_scalar;
   Constant* llvm_visc_ocl_argument_ptr;
   Constant* llvm_visc_ocl_output_ptr;
@@ -260,6 +263,7 @@ void CGT_NVPTX::initRuntimeAPI() {
   DECLARE(llvm_visc_ocl_wait);
   DECLARE(llvm_visc_ocl_initContext);
   DECLARE(llvm_visc_ocl_clearContext);
+  DECLARE(llvm_visc_ocl_argument_shared);
   DECLARE(llvm_visc_ocl_argument_scalar);
   DECLARE(llvm_visc_ocl_argument_ptr);
   DECLARE(llvm_visc_ocl_output_ptr);
@@ -507,60 +511,55 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
                        ArrayRef<Value*>(setInputArgs, 4), "", RI);
     }
   }
-  DEBUG(errs() << "Setup output edges of node and insert visc api\n");
 
-  std::map<unsigned, std::pair<Value*, unsigned> > kernelSharedInArgMap =
-   K->getSharedInArgMap();
+  DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
 
+  // Check to see if all the allocation sizes are constant (determined
+  // statically)
   bool constSizes = true;
-  for (std::map<unsigned, std::pair<Value*, unsigned> >::iterator
-       ib = kernelSharedInArgMap.begin(),
-       ie = kernelSharedInArgMap.end(); ib != ie && constSizes; ++ib) {
-    Value* sizeVal = ib->second.first;
-    constSizes = isa<Constant>(sizeVal);
+  for (auto& e: K->getSharedInArgMap()) {
+    constSizes &= isa<Constant>(e.second.first);
   }
 
+  // If the sizes are all constant
   if (constSizes) {
-    for (std::map<unsigned, std::pair<Value*, unsigned> >::iterator
-         ib = kernelSharedInArgMap.begin(),
-         ie = kernelSharedInArgMap.end(); ib != ie; ++ib) {
-      unsigned i = ib->first;
-      Value* inputVal = ib->second.first;
+    for (auto& e: K->getSharedInArgMap()) {
+      unsigned argNum = e.first;
+      Value* allocSize = e.second.first;
 
-      DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
+      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
 
-      // input value has been obtained.
-      // inputVal is a scalar value
-      if (i % 2 == 0) { // Shared memory ptr argument - scalar at size position
+      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
+        // Shared memory ptr argument - scalar at size position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
 
-        assert(isa<Constant>(inputVal) && "Only constant shared memory size is supported");
+        assert(isa<Constant>(allocSize) && "Constant shared memory size is expected");
 
         Value* setInputArgs[] = {GraphID,
-                                 ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())),
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                                 inputVal
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 allocSize
                                 };
-        CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
-
-      } else { // Sharem memory size argument - scalar at address position
+        CallInst::Create(llvm_visc_ocl_argument_shared,
+                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
+      }
+      else {
+        // Shared memory size argument - scalar at address position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
         // Store the scalar value on stack and then pass the pointer to its
         // location
-        AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(),
-          kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".ptr", RI);
-        StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
+        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(),
+                                        allocSize->getName()+".sharedMem.ptr", RI);
+        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
 
-        Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
+        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
                                Type::getInt8PtrTy(M.getContext()),
-                               kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".i8ptr",
+                               allocSize->getName()+".sharedMem.i8ptr",
                                RI);
 
         Value* setInputArgs[] = {GraphID,
-                                 inputValI8Ptr,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                                 ConstantExpr::getSizeOf(inputVal->getType())
+                                 allocSizeI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 ConstantExpr::getSizeOf(allocSize->getType())
                                 };
         CallInst::Create(llvm_visc_ocl_argument_scalar,
                          ArrayRef<Value*>(setInputArgs, 4), "", RI);
@@ -584,44 +583,41 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
       ExtractValueInstVec.push_back(EI);
     }
 
-    for (std::map<unsigned, std::pair<Value*, unsigned> >::iterator
-         ib = kernelSharedInArgMap.begin(),
-         ie = kernelSharedInArgMap.end(); ib != ie; ++ib) {
-      unsigned i = ib->first;
-      Value* inputVal = ExtractValueInstVec[ib->second.second/2];
+    for (auto& e: K->getSharedInArgMap()) {
+      unsigned argNum = e.first;
+      Value* allocSize = ExtractValueInstVec[e.second.second/2];
 
-      DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
+      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
 
-      // input value has been obtained.
-      // inputVal is a scalar value
-      if (i % 2 == 0) { // Shared memory ptr argument - scalar at size position
+      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
+        // Shared memory ptr argument - scalar at size position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
 
         Value* setInputArgs[] = {GraphID,
-                                 ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())),
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                                 inputVal
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 allocSize
                                 };
-        CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
-
-      } else { // Sharem memory size argument - scalar at address position
+        CallInst::Create(llvm_visc_ocl_argument_shared,
+                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
+      }
+      else {
+        // Shared memory size argument - scalar at address position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
         // Store the scalar value on stack and then pass the pointer to its
         // location
-        AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(),
-          kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".ptr", RI);
-        StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
+        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(),
+                                        allocSize->getName()+".sharedMem.ptr", RI);
+        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
 
-        Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
+        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
                                Type::getInt8PtrTy(M.getContext()),
-                               kernel->KernelFunction->getName()+".sharedMem."+Twine(i)+".i8ptr",
+                               allocSize->getName()+".sharedMem.i8ptr",
                                RI);
 
         Value* setInputArgs[] = {GraphID,
-                                 inputValI8Ptr,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                                 ConstantExpr::getSizeOf(inputVal->getType())
+                                 allocSizeI8Ptr,
+                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
+                                 ConstantExpr::getSizeOf(allocSize->getType())
                                 };
         CallInst::Create(llvm_visc_ocl_argument_scalar,
                          ArrayRef<Value*>(setInputArgs, 4), "", RI);
@@ -629,8 +625,8 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
     }
   }
 
-  DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
 
+  DEBUG(errs() << "Setup output edges of node and insert visc api\n");
   // Set output if struct is not an empty struct
   StructType* OutputTy = K->KernelLeafNode->getOutputType();
   std::vector<Value*> d_Outputs;
@@ -911,7 +907,6 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
 
   // Checking which node is the kernel launch
   DFNode* PNode = N->getParent();
-  errs() << "Parent Node: " << PNode << " " << PNode->getFuncPointer()->getName() << "\n";
   int pLevel = PNode->getLevel();
   int pReplFactor = PNode->getNumOfDim();
 
@@ -986,9 +981,9 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end();
        ieb != iee; ++ieb) {
     DFNode *SrcDFNode = (*ieb)->getSourceDF();
-    errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n";
-    errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n";
-    errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n";
+    DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n");
     if (!SrcDFNode->isDummyNode()) {
       assert(SrcDFNode->isAllocationNode());
       kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode);
@@ -1284,18 +1279,72 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         errs() << F_nvptx->getName() << "\t: Handling barrier\n";
         errs() << "Substitute with barrier()\n";
         errs() << *II << "\n";
-          FunctionType* FT =
-            FunctionType::get(Type::getVoidTy(getGlobalContext() /*KernelM.getContext()*/),
+        FunctionType* FT = FunctionType::get(Type::getVoidTy(getGlobalContext() /*KernelM.getContext()*/),
                               std::vector<Type*>(1, Type::getInt32Ty(getGlobalContext() /*KernelM.getContext()*/)),
                               false);
-          Function* OpenCLFunction = cast<Function>
+        Function* OpenCLFunction = cast<Function>
                            (KernelM.getOrInsertFunction(StringRef("barrier"), FT));
         CallInst* CI = CallInst::Create(OpenCLFunction,
                                ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(getGlobalContext()), 1)),
                                "", II);
         II->replaceAllUsesWith(CI);
+        IItoRemove.push_back(II); 
+      }
+      break;
+      case Intrinsic::visc_atomic_cmpxchg:
+      break;
+      case Intrinsic::visc_atomic_add:
+      case Intrinsic::visc_atomic_sub:
+      case Intrinsic::visc_atomic_xchg:
+      case Intrinsic::visc_atomic_min:
+      case Intrinsic::visc_atomic_max:
+      case Intrinsic::visc_atomic_and:
+      case Intrinsic::visc_atomic_or:
+      case Intrinsic::visc_atomic_xor:
+      //case Intrinsic::visc_atomic_inc:
+      //case Intrinsic::visc_atomic_dec:
+      {
+        errs() << *II << "\n";
+        // Only have support for i32 atomic intrinsics
+        assert(II->getType() == Type::getInt32Ty(II->getContext())
+            && "Only support i32 atomic intrinsics for now");
+        // Store the argument types and operand values in vectors
+        //std::vector<Type*> ArgTypes;
+        //std::vector<Value*> ArgValues;
+        //for(unsigned i=0; i < II->getNumArgOperands(); i++) {
+          //Value* V = II->getArgOperand(i);
+          //if(V->getType()->isPointerTy()) {
+            //If it is a pointer type, then bit cast to i32* as intrinsics use
+            //i8* for all pointers
+            //V = CastInst::CreatePointerCast(V, Type::getInt32PtrTy(II->getContext()), "", II);
+            //errs() << *V << "\n";
+          //}
+          //ArgTypes.push_back(V->getType());
+          //ArgValues.push_back(V);
+        //}
+        // Substitute with atomicrmw instruction
+        assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics");
+        Value* Ptr = II->getArgOperand(0);
+        Value* Val = II->getArgOperand(1);
+        assert(Ptr->getType()->isPointerTy()
+            && "First argument of supported atomics is expected to be a pointer");
+        PointerType* PtrTy = cast<PointerType>(Ptr->getType());
+        if(PtrTy != Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace())) {
+          Ptr = CastInst::CreatePointerCast(Ptr, Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()), "", II);
+        }
+        AtomicRMWInst* AtomicInst = new AtomicRMWInst(getAtomicOp(II->getIntrinsicID()),
+            Ptr, Val, llvm::SequentiallyConsistent, llvm::CrossThread, II);
+        AtomicInst->setVolatile(true);
+        // Create OpenCL function call
+        //FunctionType* FT = FunctionType::get(Type::getInt32Ty(getGlobalContext()),
+            //ArgTypes, false);
+        //Function* OpenCLFunction = cast<Function>(KernelM.getOrInsertFunction(
+              //StringRef(getAtomicOpName(II->getIntrinsicID())), FT));
+        //CallInst* CI = CallInst::Create(OpenCLFunction, ArgValues, II->getName(), II);
+        //errs() << "Substitute with: " << *CI << "\n";
+        errs() << "Substitute with: " << *AtomicInst << "\n";
+        II->replaceAllUsesWith(AtomicInst);
         IItoRemove.push_back(II);
-       
       }
       break;
       default:
@@ -1326,8 +1375,10 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // Traverse the vector backwards, otherwise definitions are deleted while
   // their subsequent uses are still around
   for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
-       re = IItoRemove.rend(); ri != re; ++ri)
+       re = IItoRemove.rend(); ri != re; ++ri) {
+    errs() << "Erasing: " << **ri << "\n";
     (*ri)->eraseFromParent();
+  }
 
   addCLMetadata(F_nvptx);
   kernel->KernelFunction = F_nvptx;
@@ -1714,6 +1765,44 @@ static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vecto
   }
 }
 
+// Helper function: maps a visc atomic intrinsic ID to its atomicrmw operation.
+static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) {
+  // cmpxchg, inc and dec have no mapping here and take the unsupported path.
+  switch (ID) {
+  case Intrinsic::visc_atomic_xchg: return AtomicRMWInst::Xchg;
+  case Intrinsic::visc_atomic_add:  return AtomicRMWInst::Add;
+  case Intrinsic::visc_atomic_sub:  return AtomicRMWInst::Sub;
+  case Intrinsic::visc_atomic_and:  return AtomicRMWInst::And;
+  case Intrinsic::visc_atomic_or:   return AtomicRMWInst::Or;
+  case Intrinsic::visc_atomic_xor:  return AtomicRMWInst::Xor;
+  case Intrinsic::visc_atomic_min:  return AtomicRMWInst::Min;
+  case Intrinsic::visc_atomic_max:  return AtomicRMWInst::Max;
+  default:
+    break;
+  }
+  llvm_unreachable("Unsupported atomic intrinsic!");
+}
+
+
+// Helper function: maps a visc atomic intrinsic ID to an OpenCL atom_* name.
+static std::string getAtomicOpName(Intrinsic::ID ID) {
+  switch (ID) {
+  case Intrinsic::visc_atomic_cmpxchg: return "atom_cmpxchg";
+  case Intrinsic::visc_atomic_xchg:    return "atom_xchg";
+  case Intrinsic::visc_atomic_add:     return "atom_add";
+  case Intrinsic::visc_atomic_sub:     return "atom_sub";
+  case Intrinsic::visc_atomic_inc:     return "atom_inc";
+  case Intrinsic::visc_atomic_dec:     return "atom_dec";
+  case Intrinsic::visc_atomic_min:     return "atom_min";
+  case Intrinsic::visc_atomic_max:     return "atom_max";
+  case Intrinsic::visc_atomic_and:     return "atom_and";
+  case Intrinsic::visc_atomic_or:      return "atom_or";
+  case Intrinsic::visc_atomic_xor:     return "atom_xor";
+  default: break; // any other intrinsic ID is not an atomic handled here
+  }
+  llvm_unreachable("Unsupported atomic intrinsic!");
+}
+
 } // End of namespace
 
 char DFG2LLVM_NVPTX::ID = 0;
diff --git a/llvm/lib/Transforms/GenVISC/GenVISC.cpp b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
index cac740e4a2..8957ebe2c8 100644
--- a/llvm/lib/Transforms/GenVISC/GenVISC.cpp
+++ b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
@@ -37,13 +37,54 @@ static void transformReturnTypeToStruct(Function* F);
 
 // Check if the dummy function call is a __visc__node call
 #define IS_VISC_CALL(callName) \
-  static bool isVISC##callName##Call(Instruction* I) { \
+  static bool isVISCCall_##callName(Instruction* I) { \
     if(!isa<CallInst>(I)) \
       return false; \
     CallInst* CI = cast<CallInst>(I); \
     return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("__visc__"#callName); \
   }
 
+// Replace the dummy call I with a call to intrinsic IntrinsicID, pointer-casting
+// arguments as needed; when Erase is non-NULL the old call is appended to it.
+static void ReplaceCallWithIntrinsic(Instruction* I, Intrinsic::ID IntrinsicID, std::vector<Instruction*>* Erase) {
+  assert(isa<CallInst>(I) && "Expecting CallInst");
+  CallInst* CI = cast<CallInst>(I);
+  DEBUG(errs() << "Found call: " << *CI << "\n");
+
+  // Find the correct intrinsic call
+  Module* M = CI->getParent()->getParent()->getParent();
+  Function* F = Intrinsic::getDeclaration(M, IntrinsicID);
+  FunctionType* FTy = F->getFunctionType();
+  DEBUG(errs() << *F << "\n");
+
+  // Create argument list
+  assert(CI->getNumArgOperands() == FTy->getNumParams()
+      && "Number of arguments of call do not match with Intrinsic");
+  std::vector<Value*> args;
+  for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
+    Value* V = CI->getArgOperand(i);
+    // Either the type should match or both should be of pointer type
+    assert((V->getType() == FTy->getParamType(i) ||
+        (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy()))
+        && "Dummy function call argument does not match with Intrinsic argument!");
+    // If the types do not match, then both must be pointer type and pointer
+    // cast needs to be performed
+    if(V->getType() != FTy->getParamType(i)) {
+      V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
+    }
+    args.push_back(V);
+  }
+  // Insert the intrinsic call right before the dummy call, reusing its name
+  CallInst* Inst = CallInst::Create(F, args, CI->getName(), CI);
+
+  DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
+
+  CI->replaceAllUsesWith(Inst);
+  // Queue the replaced call for erasure only if the caller supplied a vector
+  if(Erase != NULL)
+    Erase->push_back(CI);
+}
+
 IS_VISC_CALL(launch) /* Exists but not required */
 IS_VISC_CALL(edge) /* Exists but not required */
 IS_VISC_CALL(createNode)
@@ -65,6 +106,18 @@ IS_VISC_CALL(getNodeInstanceID_z)
 IS_VISC_CALL(getNumNodeInstances_x)
 IS_VISC_CALL(getNumNodeInstances_y)
 IS_VISC_CALL(getNumNodeInstances_z)
+// Atomics
+IS_VISC_CALL(atomic_cmpxchg)
+IS_VISC_CALL(atomic_add)
+IS_VISC_CALL(atomic_sub)
+IS_VISC_CALL(atomic_xchg)
+IS_VISC_CALL(atomic_inc)
+IS_VISC_CALL(atomic_dec)
+IS_VISC_CALL(atomic_min)
+IS_VISC_CALL(atomic_max)
+IS_VISC_CALL(atomic_and)
+IS_VISC_CALL(atomic_or)
+IS_VISC_CALL(atomic_xor)
 
 IS_VISC_CALL(init)
 IS_VISC_CALL(node)
@@ -119,7 +172,7 @@ static void addArgs(Function* F, unsigned numArgs, std::string names[]) {
 // values being returned into a struct and returning it
 static Value* genCodeForReturn(CallInst* CI) {
   LLVMContext& Ctx = CI->getContext();
-  assert(isVISCreturnCall(CI)
+  assert(isVISCCall_return(CI)
       && "__visc__return instruction expected!");
   std::vector<Type*> ArgTypes;
   for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
@@ -133,15 +186,15 @@ static Value* genCodeForReturn(CallInst* CI) {
                                                   0,
                                                   "returnStruct",
                                                   CI);
-  errs() << "Generate Instructin:\n";
-  errs() << *IV << "\n";
+  DEBUG(errs() << "Code generation for return:\n");
+  DEBUG(errs() << *IV << "\n");
   for(unsigned i=1; i < CI->getNumArgOperands(); i++) {
     IV = InsertValueInst::Create(IV,
                                  CI->getArgOperand(i),
                                  i,
                                  IV->getName(),
                                  CI);
-    errs() << *IV << "\n";
+    DEBUG(errs() << *IV << "\n");
   }
   
   return IV;
@@ -279,7 +332,7 @@ static std::vector<CallInst*>* getWaitList(Value* GraphID) {
   for(Value::use_iterator ui = GraphID->use_begin(),
       ue = GraphID->use_end(); ui!=ue; ++ui) {
     if(CallInst* waitI = dyn_cast<CallInst>(*ui)) {
-      assert(isVISCwaitCall(waitI)
+      assert(isVISCCall_wait(waitI)
              && "GraphID can only be used by __visc__wait call");
       WaitList->push_back(waitI);
     }
@@ -726,7 +779,7 @@ bool GenVISC::runOnModule(Module &M) {
   errs() << "\nGENVISC PASS\n";
   this->M = &M;
 
-// Load Runtime API Module
+  // Load Runtime API Module
   SMDiagnostic Err;
 
   char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
@@ -792,7 +845,7 @@ bool GenVISC::runOnModule(Module &M) {
       LLVMContext& Ctx = CI->getContext();
       // If __visc__node call found, generate the test case
 
-      if(isVISCnodeCall(I)) {
+      if(isVISCCall_node(I)) {
         errs() << "Found visc node call in Function: " << f->getName() << "\n";
         assert(CI->getNumArgOperands() >= 5
                && "__visc__node call should have atleast 5 arguments!");
@@ -800,62 +853,25 @@ bool GenVISC::runOnModule(Module &M) {
         // Place this call in the list of instructions to be erased.
         toBeErased.push_back(CI);
       }
-      if(isVISCinitCall(I)) {
-        Function* InitF = Intrinsic::getDeclaration(&M, Intrinsic::visc_init);
-        DEBUG(errs() << *InitF << "\n");
-        CallInst* InitInst = CallInst::Create(InitF,
-                                              None, "", CI);
-        toBeErased.push_back(CI);
-        DEBUG(errs() << "Found visc init call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *InitInst << "\n");
+      if(isVISCCall_init(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_init, &toBeErased);
       }
-      if(isVISCcleanupCall(I)) {
-        Function* CleanupF = Intrinsic::getDeclaration(&M, Intrinsic::visc_cleanup);
-        CallInst* CleanupInst = CallInst::Create(CleanupF,
-                                None,
-                                "", CI);
-        DEBUG(errs() << "Found visc cleanup call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *CleanupInst << "\n");
-        toBeErased.push_back(CI);
+      if(isVISCCall_cleanup(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_cleanup, &toBeErased);
       }
-      if(isVISCwaitCall(I)) {
-        Function* WaitF = Intrinsic::getDeclaration(&M, Intrinsic::visc_wait);
-        DEBUG(errs() << *WaitF << "\n");
-        CallInst* WaitInst = CallInst::Create(WaitF,
-                                              ArrayRef<Value*>(CI->getArgOperand(0)),
-                                              "", CI);
-        DEBUG(errs() << "Found visc wait call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *WaitInst << "\n");
-        CI->replaceAllUsesWith(WaitInst);
-        toBeErased.push_back(CI);
+      if(isVISCCall_wait(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_wait, &toBeErased);
       }
-      if(isVISCtrackMemoryCall(I)) {
-        Function* TrackMemoryF = Intrinsic::getDeclaration(&M, Intrinsic::visc_trackMemory);
-        Value* TrackMemArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1)};
-        CallInst* TrackMemInst = CallInst::Create(TrackMemoryF,
-                                 ArrayRef<Value*>(TrackMemArgs,2),
-                                 "", CI);
-        DEBUG(errs() << "Found visc track memory call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *TrackMemInst << "\n");
+      if(isVISCCall_trackMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_trackMemory, &toBeErased);
       }
-      if(isVISCuntrackMemoryCall(I)) {
-        Function* UntrackMemoryF = Intrinsic::getDeclaration(&M, Intrinsic::visc_untrackMemory);
-        CallInst* UntrackMemInst = CallInst::Create(UntrackMemoryF,
-                                   ArrayRef<Value*>(CI->getArgOperand(0)),
-                                   "", CI);
-        DEBUG(errs() << "Found visc *un*track memory call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *UntrackMemInst << "\n");
+      if(isVISCCall_untrackMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_untrackMemory, &toBeErased);
       }
-      if(isVISCrequestMemoryCall(I)) {
-        Function* RequestMemoryF = Intrinsic::getDeclaration(&M, Intrinsic::visc_requestMemory);
-        Value* RequestMemArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1)};
-        CallInst* RequestMemInst = CallInst::Create(RequestMemoryF,
-                                   ArrayRef<Value*>(RequestMemArgs,2),
-                                   "", CI);
-        DEBUG(errs() << "Found visc request memory call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *RequestMemInst << "\n");
+      if(isVISCCall_requestMemory(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_requestMemory, &toBeErased);
       }
-      if(isVISChintCall(I)) {
+      if(isVISCCall_hint(I)) {
         assert(isa<ConstantInt>(CI->getArgOperand(0))
                && "Argument to hint must be constant integer!");
         ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0));
@@ -866,7 +882,7 @@ bool GenVISC::runOnModule(Module &M) {
         DEBUG(errs() << "Found visc hint call: " << *CI << "\n");
         toBeErased.push_back(CI);
       }
-      if(isVISClaunchCall(I)) {
+      if(isVISCCall_launch(I)) {
         Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch);
         DEBUG(errs() << *LaunchF << "\n");
         // Get i8* cast to function pointer
@@ -886,33 +902,14 @@ bool GenVISC::runOnModule(Module &M) {
         DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n");
         CI->replaceAllUsesWith(LaunchInst);
         toBeErased.push_back(CI);
-
       }
-      if(isVISCpushCall(I)) {
-        Function* PushF = Intrinsic::getDeclaration(&M, Intrinsic::visc_push);
-        DEBUG(errs() << *PushF << "\n");
-
-        Value* PushArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1)};
-        CallInst* PushInst = CallInst::Create(PushF,
-                                              ArrayRef<Value*>(PushArgs, 2),
-                                              "", CI);
-        DEBUG(errs() << "Found visc push call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *PushInst << "\n");
-        CI->replaceAllUsesWith(PushInst);
-        toBeErased.push_back(CI);
+      if(isVISCCall_push(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_push, &toBeErased);
       }
-      if(isVISCpopCall(I)) {
-        Function* PopF = Intrinsic::getDeclaration(&M, Intrinsic::visc_pop);
-        DEBUG(errs() << *PopF << "\n");
-        CallInst* PopInst = CallInst::Create(PopF,
-                                             ArrayRef<Value*>(CI->getArgOperand(0)),
-                                             "output", CI);
-        DEBUG(errs() << "Found visc pop call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *PopInst << "\n");
-        CI->replaceAllUsesWith(PopInst);
-        toBeErased.push_back(CI);
+      if(isVISCCall_pop(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_pop, &toBeErased);
       }
-      if(isVISCcreateNodeCall(I)) {
+      if(isVISCCall_createNode(I)) {
         Function* CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode);
         DEBUG(errs() << *CreateNodeF << "\n");
 
@@ -930,7 +927,7 @@ bool GenVISC::runOnModule(Module &M) {
         toBeErased.push_back(CI);
       }
 
-      if(isVISCcreateNode1DCall(I)) {
+      if(isVISCCall_createNode1D(I)) {
         Function* CreateNode1DF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D);
         DEBUG(errs() << *CreateNode1DF << "\n");
 
@@ -948,7 +945,7 @@ bool GenVISC::runOnModule(Module &M) {
         CI->replaceAllUsesWith(CreateNode1DInst);
         toBeErased.push_back(CI);
       }
-      if(isVISCcreateNode2DCall(I)) {
+      if(isVISCCall_createNode2D(I)) {
         Function* CreateNode2DF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D);
         DEBUG(errs() << *CreateNode2DF << "\n");
 
@@ -966,7 +963,7 @@ bool GenVISC::runOnModule(Module &M) {
         CI->replaceAllUsesWith(CreateNode2DInst);
         toBeErased.push_back(CI);
       }
-      if(isVISCcreateNode3DCall(I)) {
+      if(isVISCCall_createNode3D(I)) {
         Function* CreateNode3DF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D);
         DEBUG(errs() << *CreateNode3DF << "\n");
 
@@ -987,7 +984,7 @@ bool GenVISC::runOnModule(Module &M) {
         toBeErased.push_back(CI);
       }
 
-      if(isVISCedgeCall(I)) {
+      if(isVISCCall_edge(I)) {
         Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge);
         DEBUG(errs() << *EdgeF << "\n");
         ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(4));
@@ -1005,7 +1002,7 @@ bool GenVISC::runOnModule(Module &M) {
         CI->replaceAllUsesWith(EdgeInst);
         toBeErased.push_back(CI);
       }
-      if(isVISCbindInCall(I)) {
+      if(isVISCCall_bindIn(I)) {
         Function* BindInF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input);
         DEBUG(errs() << *BindInF << "\n");
         // Check if this is a streaming bind or not
@@ -1023,7 +1020,7 @@ bool GenVISC::runOnModule(Module &M) {
         CI->replaceAllUsesWith(BindInInst);
         toBeErased.push_back(CI);
       }
-      if(isVISCbindOutCall(I)) {
+      if(isVISCCall_bindOut(I)) {
         Function* BindOutF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output);
         DEBUG(errs() << *BindOutF << "\n");
         // Check if this is a streaming bind or not
@@ -1041,55 +1038,24 @@ bool GenVISC::runOnModule(Module &M) {
         CI->replaceAllUsesWith(BindOutInst);
         toBeErased.push_back(CI);
       }
-      if(isVISCattributesCall(I)) {
+      if(isVISCCall_attributes(I)) {
         Function* F = CI->getParent()->getParent();
         handleVISCAttributes(F, CI);
         toBeErased.push_back(CI);
       }
-      if (isVISCgetNodeCall(I)) {
-        Function* GetNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_getNode);
-        DEBUG(errs() << *GetNodeF << "\n");
-        CallInst* GetNodeInst = CallInst::Create(GetNodeF,
-                                None, "this.node", CI);
-        DEBUG(errs() << "Found visc getNode call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *GetNodeInst << "\n");
-        CI->replaceAllUsesWith(GetNodeInst);
-        toBeErased.push_back(CI);
-       
+      if (isVISCCall_getNode(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNode, &toBeErased);
       }
-      if (isVISCgetParentNodeCall(I)) {
-        Function* GetParentNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_getParentNode);
-        DEBUG(errs() << *GetParentNodeF << "\n");
-        CallInst* GetParentNodeInst = CallInst::Create(GetParentNodeF,
-                                ArrayRef<Value*>(CI->getArgOperand(0)),
-                                "this.node", CI);
-        DEBUG(errs() << "Found visc getParentNode call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *GetParentNodeInst << "\n");
-        CI->replaceAllUsesWith(GetParentNodeInst);
-        toBeErased.push_back(CI);
-       
+      if (isVISCCall_getParentNode(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getParentNode, &toBeErased);
       }
-      if (isVISCbarrierCall(I)) {
-        Function* BarrierF = Intrinsic::getDeclaration(&M, Intrinsic::visc_barrier);
-        DEBUG(errs() << *BarrierF << "\n");
-        CallInst* BarrierInst = CallInst::Create(BarrierF,
-                                None, "", CI);
-        DEBUG(errs() << "Found visc barrier call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *BarrierInst << "\n");
-        CI->replaceAllUsesWith(BarrierInst);
-        toBeErased.push_back(CI);
+      if (isVISCCall_barrier(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_barrier, &toBeErased);
       }
-      if (isVISCmallocCall(I)) {
-        Function* MallocF = Intrinsic::getDeclaration(&M, Intrinsic::visc_malloc);
-        DEBUG(errs() << *MallocF << "\n");
-        CallInst* MallocInst = CallInst::Create(MallocF,
-                                CI->getArgOperand(0), "", CI);
-        DEBUG(errs() << "Found visc malloc call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *MallocInst << "\n");
-        CI->replaceAllUsesWith(MallocInst);
-        toBeErased.push_back(CI);
+      if (isVISCCall_malloc(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased);
       }
-      if (isVISCreturnCall(I)) {
+      if (isVISCCall_return(I)) {
         // The operands to this call are the values to be returned by the node
         Value* ReturnVal = genCodeForReturn(CI);
         DEBUG(errs() << *ReturnVal << "\n");
@@ -1118,95 +1084,68 @@ bool GenVISC::runOnModule(Module &M) {
             && "Multiple returns with mismatching types");
         
         ReturnInst* RetInst = ReturnInst::Create(Ctx, ReturnVal);
-        errs() << "Found visc return call: " << *CI << "\n";
+        DEBUG(errs() << "Found visc return call: " << *CI << "\n");
         Instruction* oldReturn = CI->getParent()->getTerminator();
         assert(isa<ReturnInst>(oldReturn)
                 && "Expecting a return to be the terminator of this BB!");
-        errs() << "Found return statement of BB: " << *oldReturn << "\n";
-        errs() << "\tSubstitute return with: " << *RetInst << "\n";
+        DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n");
+        DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n");
         //CI->replaceAllUsesWith(RetInst);
         toBeErased.push_back(CI);
         ReplaceInstWithInst(oldReturn, RetInst);
       }
 
-      if (isVISCgetNodeInstanceID_xCall(I)) {
-        Function* NodeInstanceID_xF = Intrinsic::getDeclaration(&M, Intrinsic::visc_getNodeInstanceID_x);
-        DEBUG(errs() << *NodeInstanceID_xF << "\n");
-        CallInst* NodeInstanceID_xInst = CallInst::Create(NodeInstanceID_xF,
-                                ArrayRef<Value*>(CI->getArgOperand(0)),
-                                "instanceID_x", CI);
-        DEBUG(errs() << "Found visc getNodeInstanceID_x call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *NodeInstanceID_xInst << "\n");
-        CI->replaceAllUsesWith(NodeInstanceID_xInst);
-        toBeErased.push_back(CI);
-       
+      if (isVISCCall_getNodeInstanceID_x(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, &toBeErased);
       }
-      if (isVISCgetNodeInstanceID_yCall(I)) {
-        Function* NodeInstanceID_yF = Intrinsic::getDeclaration(&M, Intrinsic::visc_getNodeInstanceID_y);
-        DEBUG(errs() << *NodeInstanceID_yF << "\n");
-        // Check if this is a streaming bind or not
-        CallInst* NodeInstanceID_yInst = CallInst::Create(NodeInstanceID_yF,
-                                ArrayRef<Value*>(CI->getArgOperand(0)),
-                                "instanceID_x", CI);
-        DEBUG(errs() << "Found visc getNodeInstanceID_y call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *NodeInstanceID_yInst << "\n");
-        CI->replaceAllUsesWith(NodeInstanceID_yInst);
-        toBeErased.push_back(CI);
-       
+      if (isVISCCall_getNodeInstanceID_y(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, &toBeErased);
       }
-      if (isVISCgetNodeInstanceID_zCall(I)) {
-        Function* NodeInstanceID_zF = Intrinsic::getDeclaration(&M, Intrinsic::visc_getNodeInstanceID_z);
-        DEBUG(errs() << *NodeInstanceID_zF << "\n");
-        // Check if this is a streaming bind or not
-        CallInst* NodeInstanceID_zInst = CallInst::Create(NodeInstanceID_zF,
-                                ArrayRef<Value*>(CI->getArgOperand(0)),
-                                "instanceID_x", CI);
-        DEBUG(errs() << "Found visc getNodeInstanceID_z call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *NodeInstanceID_zInst << "\n");
-        CI->replaceAllUsesWith(NodeInstanceID_zInst);
-        toBeErased.push_back(CI);
-       
+      if (isVISCCall_getNodeInstanceID_z(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, &toBeErased);
       }
-      if (isVISCgetNumNodeInstances_xCall(I)) {
-        Function* NumNodeInstances_xF = Intrinsic::getDeclaration(&M, Intrinsic::visc_getNumNodeInstances_x);
-        DEBUG(errs() << *NumNodeInstances_xF << "\n");
-        // Check if this is a streaming bind or not
-        CallInst* NumNodeInstances_xInst = CallInst::Create(NumNodeInstances_xF,
-                                ArrayRef<Value*>(CI->getArgOperand(0)),
-                                "instanceID_x", CI);
-        DEBUG(errs() << "Found visc getNumNodeInstances_x call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *NumNodeInstances_xInst << "\n");
-        CI->replaceAllUsesWith(NumNodeInstances_xInst);
-        toBeErased.push_back(CI);
-       
+      if (isVISCCall_getNumNodeInstances_x(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, &toBeErased);
       }
-      if (isVISCgetNumNodeInstances_yCall(I)) {
-        Function* NumNodeInstances_yF = Intrinsic::getDeclaration(&M, Intrinsic::visc_getNumNodeInstances_y);
-        DEBUG(errs() << *NumNodeInstances_yF << "\n");
-        // Check if this is a streaming bind or not
-        CallInst* NumNodeInstances_yInst = CallInst::Create(NumNodeInstances_yF,
-                                ArrayRef<Value*>(CI->getArgOperand(0)),
-                                "instanceID_x", CI);
-        DEBUG(errs() << "Found visc getNumNodeInstances_y call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *NumNodeInstances_yInst << "\n");
-        CI->replaceAllUsesWith(NumNodeInstances_yInst);
-        toBeErased.push_back(CI);
-       
+      if (isVISCCall_getNumNodeInstances_y(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, &toBeErased);
       }
-      if (isVISCgetNumNodeInstances_zCall(I)) {
-        Function* NumNodeInstances_zF = Intrinsic::getDeclaration(&M, Intrinsic::visc_getNumNodeInstances_z);
-        DEBUG(errs() << *NumNodeInstances_zF << "\n");
-        // Check if this is a streaming bind or not
-        CallInst* NumNodeInstances_zInst = CallInst::Create(NumNodeInstances_zF,
-                                ArrayRef<Value*>(CI->getArgOperand(0)),
-                                "instanceID_x", CI);
-        DEBUG(errs() << "Found visc getNumNodeInstances_z call: " << *CI << "\n");
-        DEBUG(errs() << "\tSubstitute with: " << *NumNodeInstances_zInst << "\n");
-        CI->replaceAllUsesWith(NumNodeInstances_zInst);
-        toBeErased.push_back(CI);
-       
+      if (isVISCCall_getNumNodeInstances_z(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, &toBeErased);
+      }
+      if (isVISCCall_atomic_cmpxchg(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_cmpxchg, &toBeErased);
+      }
+      if (isVISCCall_atomic_add(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_add, &toBeErased);
+      }
+      if (isVISCCall_atomic_sub(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_sub, &toBeErased);
+      }
+      if (isVISCCall_atomic_xchg(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xchg, &toBeErased);
+      }
+      if (isVISCCall_atomic_inc(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_inc, &toBeErased);
+      }
+      if (isVISCCall_atomic_dec(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_dec, &toBeErased);
+      }
+      if (isVISCCall_atomic_min(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_min, &toBeErased);
+      }
+      if (isVISCCall_atomic_max(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_max, &toBeErased);
+      }
+      if (isVISCCall_atomic_and(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_and, &toBeErased);
+      }
+      if (isVISCCall_atomic_or(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_or, &toBeErased);
+      }
+      if (isVISCCall_atomic_xor(I)) {
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_xor, &toBeErased);
       }
-
     }
   }
 
@@ -1299,7 +1238,7 @@ void GenVISC::genKernel(Function* KernelF, CallInst* CI, StructType* RetTy) {
   // in/out to pointer arguments
   for (inst_iterator i = inst_begin(KernelF), e = inst_end(KernelF); i != e; ++i) {
     Instruction *I = &(*i);
-    if(isVISCattributesCall(I)) {
+    if(isVISCCall_attributes(I)) {
       handleVISCAttributes(KernelF, cast<CallInst>(I));
       //I->eraseFromParent();
       break;
@@ -1484,9 +1423,9 @@ static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) {
 
 static void transformReturnTypeToStruct(Function* F) {
   // Currently only works for void return types
-  errs() << "Transforming return type of function to Struct: " << F->getName() << "\n";
+  DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n");
   if(!F->getReturnType()->isVoidTy()) {
-    errs() << "Warning: Unhandled case - Only void return type handled\n";
+    errs() << "Warning: Unhandled case - Only void return type handled. Function: " << F->getName() << "\n";
     return;
   }
   // Create the argument type list with added argument types
diff --git a/llvm/lib/Transforms/LocalMem/LocalMem.cpp b/llvm/lib/Transforms/LocalMem/LocalMem.cpp
index 7041a3bf36..0de36b51cb 100644
--- a/llvm/lib/Transforms/LocalMem/LocalMem.cpp
+++ b/llvm/lib/Transforms/LocalMem/LocalMem.cpp
@@ -74,7 +74,7 @@ public:
 };
 
 bool LocalMem::runOnModule(Module &M) {
-  errs() << "\nLocalMem PASS\n";
+  errs() << "\nLOCALMEM PASS\n";
 
   // Get the BuildDFG Analysis Results:
   // - Dataflow graph
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index b087ceff6a..856ce0ea03 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -153,9 +153,9 @@ static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL
   else                    clFlags = CL_MEM_READ_ONLY;
 
   visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
-
   cl_mem d_input = clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
+  DEBUG(cout<< "\nMemory allocated on device: " << d_input << "\n");
   if(isInput)
     errcode = clEnqueueWriteBuffer(Context->clCommandQue,
                                   d_input,
@@ -1252,6 +1252,16 @@ void llvm_visc_ocl_clearContext(void* graphID) {
 
 }
 
+void llvm_visc_ocl_argument_shared(void* graphID, int arg_index, size_t size) {
+  // Sets kernel argument `arg_index` with a byte count but a NULL value pointer.
+  DFNodeContext_OCL* const oclCtx = static_cast<DFNodeContext_OCL*>(graphID);
+  DEBUG(cout << "Set Shared Memory Input:" << "\tArgument Index = " << arg_index << ", Size = " << size << "\n");
+  DEBUG(cout << "Using Context: " << oclCtx << "\n");
+  DEBUG(cout << "Using clKernel: " << oclCtx->clKernel << "\n");
+  const cl_int status = clSetKernelArg(oclCtx->clKernel, arg_index, size, NULL);
+  checkErr(status, CL_SUCCESS, "Failure to set shared memory argument");
+}
+
 void llvm_visc_ocl_argument_scalar(void* graphID, void* input, int arg_index, size_t size) {
   DEBUG(cout << "Set Scalar Input:");
   DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << "\n");
diff --git a/llvm/projects/visc-rt/visc-rt.h b/llvm/projects/visc-rt/visc-rt.h
index 68bd51d45d..aa8745560a 100644
--- a/llvm/projects/visc-rt/visc-rt.h
+++ b/llvm/projects/visc-rt/visc-rt.h
@@ -161,6 +161,7 @@ void llvm_visc_x86_wait(void*);
 void* llvm_visc_ocl_initContext(enum visc::Target);
 
 void llvm_visc_ocl_clearContext(void*);
+void llvm_visc_ocl_argument_shared(void*, int, size_t);
 void llvm_visc_ocl_argument_scalar(void*, void*, int, size_t);
 void* llvm_visc_ocl_argument_ptr(void*, void*, int, size_t, bool, bool);
 void* llvm_visc_ocl_output_ptr(void*, int, size_t);
diff --git a/llvm/test/VISC/parboil/.ycm_extra_conf.py b/llvm/test/VISC/parboil/.ycm_extra_conf.py
index 3615b034aa..bccfaddfeb 100644
--- a/llvm/test/VISC/parboil/.ycm_extra_conf.py
+++ b/llvm/test/VISC/parboil/.ycm_extra_conf.py
@@ -51,6 +51,7 @@ flags = [
     '-I./include',
     '-isystem', '/opt/intel/opencl-sdk/include'
     '-isystem', '/usr/local/cuda/include',
+    '-isystem', '/home/psrivas2/current-src/include',
 ]
 
 # Set this to the absolute path to the folder (NOT the file!) containing the
diff --git a/llvm/test/VISC/parboil/common/include/visc.h b/llvm/test/VISC/parboil/common/include/visc.h
index ea706ccd0c..d407f256dc 100644
--- a/llvm/test/VISC/parboil/common/include/visc.h
+++ b/llvm/test/VISC/parboil/common/include/visc.h
@@ -40,6 +40,31 @@ unsigned __visc__getNumNodeInstances_x(void*);
 unsigned __visc__getNumNodeInstances_y(void*);
 unsigned __visc__getNumNodeInstances_z(void*);
 
+// Atomic operations: every prototype takes an int* first and returns int. NOTE(review): presumably replaced by GenVISC with the visc_atomic_* intrinsics -- confirm.
+// signed int versions
+int __visc__atomic_cmpxchg(int*, int, int);
+int __visc__atomic_add(int*, int);
+int __visc__atomic_sub(int*, int);
+int __visc__atomic_xchg(int*, int);
+int __visc__atomic_inc(int*);
+int __visc__atomic_dec(int*);
+int __visc__atomic_min(int*, int);
+int __visc__atomic_max(int*, int);
+int __visc__atomic_and(int*, int);
+int __visc__atomic_or(int*, int);
+int __visc__atomic_xor(int*, int);
+// unsigned int versions (currently commented out, so only the int forms above are declared)
+//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned);
+//unsigned __visc__atomic_add(unsigned*, unsigned);
+//unsigned __visc__atomic_sub(unsigned*, unsigned);
+//unsigned __visc__atomic_xchg(unsigned*, unsigned);
+//unsigned __visc__atomic_inc(unsigned*);
+//unsigned __visc__atomic_dec(unsigned*);
+//unsigned __visc__atomic_min(unsigned*, unsigned);
+//unsigned __visc__atomic_max(unsigned*, unsigned);
+//unsigned __visc__atomic_and(unsigned*, unsigned);
+//unsigned __visc__atomic_or(unsigned*, unsigned);
+//unsigned __visc__atomic_xor(unsigned*, unsigned);
 
 
 #endif
-- 
GitLab