diff --git a/llvm/include/llvm/IR/IntrinsicsVISC.td b/llvm/include/llvm/IR/IntrinsicsVISC.td
index 40f2a3a9ecbe7b777e6910e9a37db43b163165d9..3ff99354f084c1fc887fc2a9e7d82aec2396b8bd 100644
--- a/llvm/include/llvm/IR/IntrinsicsVISC.td
+++ b/llvm/include/llvm/IR/IntrinsicsVISC.td
@@ -124,38 +124,32 @@ let TargetPrefix = "visc" in {
 
   /* Find the unique indentifier of a dataflow node (with respect to his parent
    * node) in the specified dimension intrinsic -
-   * i32 llvm.visc.getNodeInstanceID(i8*, i32);
    */
-//  def int_visc_getNodeInstanceID : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty,
-//                                             llvm_i32_ty], []>;
 
-  /* i32 llvm.visc.getNodeInstanceID.[xyz](i8*);
+  /* i64 llvm.visc.getNodeInstanceID.[xyz](i8*);
    */
-  def int_visc_getNodeInstanceID_x : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+  def int_visc_getNodeInstanceID_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
                                                [IntrNoMem]>;
 
-  def int_visc_getNodeInstanceID_y : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+  def int_visc_getNodeInstanceID_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
                                                [IntrNoMem]>;
 
-  def int_visc_getNodeInstanceID_z : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+  def int_visc_getNodeInstanceID_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
                                                [IntrNoMem]>;
 
   /* Find the number of instances of a dataflow node in the specified dimension
    * intrinsic -
-   * i32 llvm.visc.getNumNodeInstances(i8*, i32);
    */
-  /*def int_visc_getNumNodeInstances : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty,*/
-                                               /*llvm_i32_ty], []>;*/
 
-  /* i32 llvm.visc.getNumNodeInstances.[xyz](i8*);
+  /* i64 llvm.visc.getNumNodeInstances.[xyz](i8*);
    */
-  def int_visc_getNumNodeInstances_x : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+  def int_visc_getNumNodeInstances_x : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
                                                  [IntrNoMem]>;
 
-  def int_visc_getNumNodeInstances_y : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+  def int_visc_getNumNodeInstances_y : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
                                                  [IntrNoMem]>;
 
-  def int_visc_getNumNodeInstances_z : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty],
+  def int_visc_getNumNodeInstances_z : Intrinsic<[llvm_i64_ty], [llvm_ptr_ty],
                                                  [IntrNoMem]>;
 
   /* Local Barrier
diff --git a/llvm/include/llvm/SupportVISC/DFG2LLVM.h b/llvm/include/llvm/SupportVISC/DFG2LLVM.h
index fff77dd0f49e41309ac207282c786a874ba15037..5cc8a7956456ac4a7af848dadc31f0918c414dc2 100644
--- a/llvm/include/llvm/SupportVISC/DFG2LLVM.h
+++ b/llvm/include/llvm/SupportVISC/DFG2LLVM.h
@@ -284,6 +284,7 @@ void CodeGenTraversal::initializeTimerSet(Instruction* InsertBefore) {
                                       GlobalValue::CommonLinkage,
                                       Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
                                       Twine("viscTimerSet_")+TargetName);
+    errs() << "New global variable: " << *TimerSet << "\n";
 
     Value* TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
                                           None,
diff --git a/llvm/include/llvm/SupportVISC/VISCUtils.h b/llvm/include/llvm/SupportVISC/VISCUtils.h
index 81fa8a4044cd863ac3ee5c63995e035d54d5e9f7..47e7582e5e878ec373bb958826929ad4e8f593b4 100644
--- a/llvm/include/llvm/SupportVISC/VISCUtils.h
+++ b/llvm/include/llvm/SupportVISC/VISCUtils.h
@@ -1,5 +1,4 @@
-
-
+//
 //===---- DFG2LLVM.h - Header file for "VISC Dataflow Graph to Target" ----===//
 //
 //                     The LLVM Compiler Infrastructure
@@ -29,10 +28,17 @@ static bool isViscCreateNodeIntrinsic(Instruction* I) {
   return (II->getCalledFunction()->getName()).startswith("llvm.visc.createNode");
 }
 
+static bool isViscCreateNodeCall(Instruction* I) {
+  if(!isa<CallInst>(I))
+    return false;
+  CallInst* CI = cast<CallInst>(I);
+  return (CI->getCalledValue()->stripPointerCasts()->getName()).startswith("__visc__createNode");
+}
+
 // Creates a new createNode intrinsic, similar to II but with different
 // associated function F instead
-IntrinsicInst* createIdenticalCreateNodeWithDifferentFunction(Function* F,
-                                                            IntrinsicInst* II) {
+IntrinsicInst* createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function* F,
+                                                                       IntrinsicInst* II) {
   Module* M = F->getParent();
 
   // Find which createNode intrinsic we need to create
@@ -79,7 +85,44 @@ IntrinsicInst* createIdenticalCreateNodeWithDifferentFunction(Function* F,
   IntrinsicInst* CreateNodeII = cast<IntrinsicInst>(CI);
   return CreateNodeII;
 }
+/*
+CallInst* createIdenticalCreateNodeCallWithDifferentFunction(Function* F,
+                                                             CallInst* CI) {
 
+  // Find which createNode function call we need to create
+  Function* CreateNodeF = CI->getCalledValue();
+
+  ArrayRef<Value*> CreateNodeArgs;
+  if ((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode")) {
+    // This is a createNode call
+    CreateNodeArgs = ArrayRef<Value*>(CreateNodeF);
+  } else if ((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode1D")) {
+    // This is a createNode1D call
+    Value* CreateNode1DArgs[] = {CreateNodeF, CI->getArgOperand(1)};
+    CreateNodeArgs = ArrayRef<Value*>(CreateNode1DArgs, 2);
+  } else if ((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode2D")) {
+    // This is a createNode2D call
+    Value* CreateNode2DArgs[] = {CreateNodeF,
+                                 CI->getArgOperand(1),
+                                 CI->getArgOperand(2)};
+    CreateNodeArgs = ArrayRef<Value*>(CreateNode2DArgs, 3);
+  } else if ((CreateNodeF->stripPointerCasts()->getName()).equals("__visc__createNode3D")) {
+    // This is a createNode3D call
+    Value* CreateNode3DArgs[] = {CreateNodeF,
+                                 CI->getArgOperand(1),
+                                 CI->getArgOperand(2),
+                                 CI->getArgOperand(3)};
+    CreateNodeArgs = ArrayRef<Value*>(CreateNode3DArgs, 4);
+  } else {
+    assert(false && "Unknown createNode call");
+  }
+
+  CallInst* newCI = CallInst::Create(CreateNodeF,
+                                     CreateNodeArgs,
+                                     F->getName()+".cncall");
+  return newCI;
+}
+*/
 
 // Fix VISC hints for this function
 void fixHintMetadata(Module &M, Function* F, Function* G) {
@@ -126,14 +169,25 @@ void replaceNodeFunctionInIR(Module &M, Function* F, Function* G) {
         if (II->getArgOperand(0) != F)
           continue; // skip it
 
-        errs() << "Fixing use: " << *II << "\n";
         // Otherwise, create a new createNode similar to the other one,
         // but with the changed function as first operand.
         IntrinsicInst* CreateNodeII =
-          createIdenticalCreateNodeWithDifferentFunction(G, II);
+          createIdenticalCreateNodeIntrinsicWithDifferentFunction(G, II);
         II->replaceAllUsesWith(CreateNodeII);
         toBeErased.push_back(II);
+      } else if (isViscCreateNodeCall(I)) {
+        CallInst* CI = cast<CallInst>(I);
+        // The found createNode is not associated with the changed function
+        if (CI->getArgOperand(0) != F)
+          continue; // skip it
+
+        errs() << "Fixing use: " << *CI << "\n";
+        errs() << "in function: " << f->getName() << "\n";
+        // Replace use of F with use of G
+        CI->setArgOperand(0, G);
+        errs() << "Fixed use: " << *CI << "\n";
       }
+
     }
 
     for(auto I: toBeErased) {
diff --git a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
index 2dc200b1834f2edd3177365b40a024dc6ef57c67..37f74325a74a04e11fad7352115214565c41a689 100644
--- a/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
+++ b/llvm/lib/Transforms/ClearDFG/ClearDFG.cpp
@@ -100,10 +100,21 @@ bool ClearDFG::runOnModule(Module &M) {
   // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
   Function* VI = M.getFunction("llvm.visc.init");
+  assert(VI->hasOneUse() && "More than one use of llvm.visc.init\n");
+  for(Value::user_iterator ui = VI->user_begin(), ue = VI->user_end(); ui != ue; ui++) {
+    Instruction* I = dyn_cast<Instruction>(*ui);
+    I->eraseFromParent();
+  }
   VI->replaceAllUsesWith(UndefValue::get(VI->getType()));
   VI->eraseFromParent();
 
   Function* VC = M.getFunction("llvm.visc.cleanup");
+  assert(VC->hasOneUse() && "More than one use of llvm.visc.cleanup\n");
+  for(Value::user_iterator ui = VC->user_begin(), ue = VC->user_end(); ui != ue; ui++) {
+    Instruction* I = dyn_cast<Instruction>(*ui);
+    I->eraseFromParent();
+  }
+    
   VC->replaceAllUsesWith(UndefValue::get(VC->getType()));
   VC->eraseFromParent();
 
diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index b9645c14afff31837275efec0a4d5e8421fb2f65..f4b7fe250005021a79bbee51fb750f0e6bfb866e 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #define ENABLE_ASSERTS
-#define TARGET_PTX 32
+#define TARGET_PTX 64
 #define GENERIC_ADDRSPACE 0
 #define GLOBAL_ADDRSPACE 1
 #define CONSTANT_ADDRSPACE 4
@@ -164,7 +164,7 @@ class CGT_NVPTX : public CodeGenTraversal {
 
 private:
   //Member variables
-  Module &KernelM;
+  std::unique_ptr<Module> KernelM;
   DFNode* KernelLaunchNode = NULL;
   Kernel* kernel;
 
@@ -202,15 +202,17 @@ private:
 public:
 
   // Constructor
-  CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(*CloneModule(&_M)) {
+  CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(&_M)) {
     init();
     initRuntimeAPI();
+    errs() << "Old module pointer: " << &_M << "\n";
+    errs() << "New module pointer: " <<  KernelM.get() << "\n";
 
     // Copying instead of creating new, in order to preserve required info (metadata)
     // Remove functions, global variables and aliases
     std::vector<GlobalVariable*> gvv = std::vector<GlobalVariable*>();
-    for (Module::global_iterator mi = KernelM.global_begin(),
-         me = KernelM.global_end(); (mi != me); ++mi) {
+    for (Module::global_iterator mi = KernelM->global_begin(),
+         me = KernelM->global_end(); (mi != me); ++mi) {
       GlobalVariable* gv = &*mi;
       gvv.push_back(gv);
     }
@@ -220,8 +222,8 @@ public:
     }
 
     std::vector<Function*> fv = std::vector<Function*>();
-    for (Module::iterator mi = KernelM.begin(),
-         me = KernelM.end(); (mi != me); ++mi) {
+    for (Module::iterator mi = KernelM->begin(),
+         me = KernelM->end(); (mi != me); ++mi) {
       Function* f = &*mi;
       fv.push_back(f);
     }
@@ -231,8 +233,8 @@ public:
     }
 
     std::vector<GlobalAlias*> av = std::vector<GlobalAlias*>();
-    for (Module::alias_iterator mi = KernelM.alias_begin(),
-         me = KernelM.alias_end(); (mi != me); ++mi) {
+    for (Module::alias_iterator mi = KernelM->alias_begin(),
+         me = KernelM->alias_end(); (mi != me); ++mi) {
       GlobalAlias* a = &*mi;
       av.push_back(a);
     }
@@ -241,11 +243,11 @@ public:
       (*vi)->eraseFromParent();
     }
 
-    changeDataLayout(KernelM);
-    changeTargetTriple(KernelM);
+    changeDataLayout(*KernelM);
+    changeTargetTriple(*KernelM);
 
 
-    DEBUG(errs() << KernelM);
+    DEBUG(errs() << *KernelM);
 
   }
 
@@ -984,7 +986,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
 
 
   // Insert the cloned function into the kernels module
-  KernelM.getFunctionList().push_back(F_nvptx);
+  KernelM->getFunctionList().push_back(F_nvptx);
 
 
   //TODO: Iterate over all the instructions of F_nvptx and identify the
@@ -1144,7 +1146,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         int numOfDim = ArgDFNode->getNumOfDim();
         DEBUG(errs() << "\t  Got node dimension : " << numOfDim << "\n");
-        IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
+        IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext());
         ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
 
         // Replace the result of the intrinsic with the computed value
@@ -1175,7 +1177,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
 
         // Argument of the function to be called
         ConstantInt * DimConstant =
-          ConstantInt::get(Type::getInt32Ty(KernelM.getContext()), dim);
+          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
         //ArrayRef<Value *> Args(DimConstant);
 
         // The following is to find which function to call
@@ -1185,6 +1187,10 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         DEBUG(errs() << "Parent Level = " << parentLevel << "\n");
         DEBUG(errs() << "Parent Repl factor = " << parentReplFactor << "\n");
 
+        FunctionType* FT =
+          FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
+                            Type::getInt32Ty(KernelM->getContext()),
+                            false);
         if ((!parentLevel || !parentReplFactor) && ArgDFNode == N) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
@@ -1193,32 +1199,20 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
           // itself
           DEBUG(errs() << "Substitute with get_global_id()\n");
           DEBUG(errs() << *II << "\n");
-          FunctionType* FT =
-            FunctionType::get(Type::getInt32Ty(KernelM.getContext()),
-                              std::vector<Type*>(1, Type::getInt32Ty(KernelM.getContext())),
-                              false);
           OpenCLFunction = cast<Function>
-                           (KernelM.getOrInsertFunction(StringRef("get_global_id"), FT));
+                           (KernelM->getOrInsertFunction(StringRef("get_global_id"), FT));
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
           //DEBUG(errs() << "Here inside cond 2\n");
           // We are asking for this node's id with respect to its parent
           // this is a local id call
-          FunctionType* FT =
-            FunctionType::get(Type::getInt32Ty(KernelM.getContext()),
-                              std::vector<Type*>(1, Type::getInt32Ty(KernelM.getContext())),
-                              false);
           OpenCLFunction = cast<Function>
-                           (KernelM.getOrInsertFunction(StringRef("get_local_id"), FT));
+                           (KernelM->getOrInsertFunction(StringRef("get_local_id"), FT));
           //DEBUG(errs() << "exiting condition 2\n");
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
           // We are asking for this node's parent's id with respect to its
           // parent: this is a group id call
-          FunctionType* FT =
-            FunctionType::get(Type::getInt32Ty(KernelM.getContext()),
-                              std::vector<Type*>(1, Type::getInt32Ty(KernelM.getContext())),
-                              false);
           OpenCLFunction = cast<Function>
-                           (KernelM.getOrInsertFunction(StringRef("get_group_id"), FT));
+                           (KernelM->getOrInsertFunction(StringRef("get_group_id"), FT));
         } else {
           errs() << N->getFuncPointer()->getName() << "\n";
           errs() << N->getParent()->getFuncPointer()->getName() << "\n";
@@ -1267,7 +1261,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
 
         // Argument of the function to be called
         ConstantInt * DimConstant =
-          ConstantInt::get(Type::getInt32Ty(KernelM.getContext()), dim);
+          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
         //ArrayRef<Value *> Args(DimConstant);
 
         // The following is to find which function to call
@@ -1275,8 +1269,8 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         int parentLevel = ParentDFNode->getLevel();
         int parentReplFactor = ParentDFNode->getNumOfDim();
         FunctionType* FT =
-            FunctionType::get(Type::getInt32Ty(KernelM.getContext()),
-                              Type::getInt32Ty(KernelM.getContext()),
+            FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
+                              Type::getInt32Ty(KernelM->getContext()),
                               false);
 
         if ((N == ArgDFNode) && (!parentLevel || !parentReplFactor)) {
@@ -1284,17 +1278,17 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
           // replicated. This indicates that the parent node is the kernel
           // launch, so the instances are global_size (gridDim x blockDim)
           OpenCLFunction = cast<Function>
-                           (KernelM.getOrInsertFunction(StringRef("get_global_size"), FT));
+                           (KernelM->getOrInsertFunction(StringRef("get_global_size"), FT));
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
           // We are asking for this node's instances
           // this is a local size (block dim) call
           OpenCLFunction = cast<Function>
-                           (KernelM.getOrInsertFunction(StringRef("get_local_size"), FT));
+                           (KernelM->getOrInsertFunction(StringRef("get_local_size"), FT));
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
           // We are asking for this node's parent's instances
           // this is a (global_size/local_size) (grid dim) call
           OpenCLFunction = cast<Function>
-                           (KernelM.getOrInsertFunction(StringRef("get_num_groups"), FT));
+                           (KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT));
         } else {
           assert(false && "Unable to translate getNumNodeInstances intrinsic");
         }
@@ -1312,13 +1306,13 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n");
         DEBUG(errs() << "Substitute with barrier()\n");
         DEBUG(errs() << *II << "\n");
-        FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM.getContext()),
-                                             std::vector<Type*>(1, Type::getInt32Ty(KernelM.getContext())),
+        FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()),
+                                             std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())),
                                              false);
         Function* OpenCLFunction = cast<Function>
-                                   (KernelM.getOrInsertFunction(StringRef("barrier"), FT));
+                                   (KernelM->getOrInsertFunction(StringRef("barrier"), FT));
         CallInst* CI = CallInst::Create(OpenCLFunction,
-                                        ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM.getContext()), 1)),
+                                        ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)),
                                         "", II);
         II->replaceAllUsesWith(CI);
         IItoRemove.push_back(II);
@@ -1373,7 +1367,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
       if(calleeF->isDeclaration()) {
         // Add the declaration to kernel module
         DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
-        KernelM.getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
+        KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
         if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) {
           // Now handle a few specific intrinsics
           // For now, sin and cos are translated to their libclc equivalent
@@ -1393,10 +1387,10 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
               name = "_Z3cosf";
 
             FunctionType* SinCosFT = FunctionType::get(II->getType(),
-                                     Type::getFloatTy(KernelM.getContext()),
+                                     Type::getFloatTy(KernelM->getContext()),
                                      false);
             Function* LibclcFunction = cast<Function>
-                                       (KernelM.getOrInsertFunction(name, SinCosFT));
+                                       (KernelM->getOrInsertFunction(name, SinCosFT));
             CallInst* CI = CallInst::Create(LibclcFunction, II->getArgOperand(0), II->getName(), II);
 
             II->replaceAllUsesWith(CI);
@@ -1406,7 +1400,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
           case Intrinsic::floor:
           {
             DEBUG(errs() << "Found floor intrinsic\n");
-            F = Intrinsic::getDeclaration(&KernelM, Intrinsic::nvvm_floor_f);
+            F = Intrinsic::getDeclaration(KernelM.get(), Intrinsic::nvvm_floor_f);
             FunctionType* FTy = F->getFunctionType();
             DEBUG(errs() << *F << "\n");
 
@@ -1446,7 +1440,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         ValueToValueMapTy VMap;
         Function* newCalleeF = CloneFunction(calleeF, VMap);
         newCalleeF->removeFromParent(); //TODO: MARIA check
-        KernelM.getFunctionList().push_back(newCalleeF);
+        KernelM->getFunctionList().push_back(newCalleeF);
       }
       //TODO: how to handle address space qualifiers in load/store
     }
@@ -1466,7 +1460,7 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   addCLMetadata(F_nvptx);
   kernel->KernelFunction = F_nvptx;
   errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
-  DEBUG(errs() << KernelM);
+  DEBUG(errs() << *KernelM);
 
   return;
 }
@@ -1581,16 +1575,16 @@ void CGT_NVPTX::addCLMetadata(Function *F) {
   // they are not mandatory. In future they might be useful to enable
   // optimizations
 
-  MDTuple *MDKernelNode = MDNode::get(KernelM.getContext(), KernelMD);
-  NamedMDNode *MDN_kernels = KernelM.getOrInsertNamedMetadata("opencl.kernels");
+  MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
   MDN_kernels->addOperand(MDKernelNode);
 
-  KernelMD.push_back(MDString::get(KernelM.getContext(), "kernel"));
+  KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
   // TODO: Replace 1 with the number of the kernel.
   // Add when support for multiple launces is added
-  KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM.getContext()),1)));
-  MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM.getContext(), KernelMD);
-  NamedMDNode *MDN_annotations = KernelM.getOrInsertNamedMetadata("nvvm.annotations");
+  KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
+  MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
   MDN_annotations->addOperand(MDNvvmAnnotationsNode);
 
 }
@@ -1612,7 +1606,7 @@ void CGT_NVPTX::writeKernelsModule() {
   Passes.add(
       createPrintModulePass(Out.os()));
 
-  Passes.run(KernelM);
+  Passes.run(*KernelM);
 
   // Declare success.
   Out.keep();
diff --git a/llvm/lib/Transforms/GenVISC/GenVISC.cpp b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
index a8c465269d1695d9ec41b5c24ee96d621108892e..8c0429af57cafd3fd9c102fe540ca568121560df 100644
--- a/llvm/lib/Transforms/GenVISC/GenVISC.cpp
+++ b/llvm/lib/Transforms/GenVISC/GenVISC.cpp
@@ -975,6 +975,7 @@ bool GenVISC::runOnModule(Module &M) {
         Function* CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode);
         DEBUG(errs() << *CreateNodeF << "\n");
         errs() << *I << "\n";
+        errs() << "in " << I->getParent()->getParent()->getName() << "\n";
 
         // Get i8* cast to function pointer
         Function* graphFunc = cast<Function>(CI->getArgOperand(0));
@@ -1147,6 +1148,7 @@ bool GenVISC::runOnModule(Module &M) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased);
       }
       if (isVISCCall_return(I)) {
+        errs() << "Function before visc return processing\n" << *I->getParent()->getParent() << "\n";
         // The operands to this call are the values to be returned by the node
         Value* ReturnVal = genCodeForReturn(CI);
         DEBUG(errs() << *ReturnVal << "\n");
@@ -1184,6 +1186,7 @@ bool GenVISC::runOnModule(Module &M) {
         //CI->replaceAllUsesWith(RetInst);
         toBeErased.push_back(CI);
         ReplaceInstWithInst(oldReturn, RetInst);
+        errs() << "Function after visc return processing\n" << *I->getParent()->getParent() << "\n";
 
       }
 
@@ -1274,6 +1277,7 @@ bool GenVISC::runOnModule(Module &M) {
     }
 
     if(bind == mutateTypeCause::mtc_BIND || bind == mutateTypeCause::mtc_RETURN) {
+        errs() << "Function before fixing return type\n" << *f << "\n";
         // Argument type list.
         std::vector<Type*> FArgTypes;
         for(Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end();
@@ -1282,8 +1286,9 @@ bool GenVISC::runOnModule(Module &M) {
         }
 
         errs() << "Fixing return statements of Function " << f->getName() << "\n";
+
         Type* ReturnType = f->getReturnType();
-        DEBUG(errs() << *ReturnType << "\n";);
+        errs() << *ReturnType << "\n";
         assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType))
                && "Return type should either be a struct or void type!");
 
@@ -1323,6 +1328,8 @@ bool GenVISC::runOnModule(Module &M) {
         errs() <<  "New: " << newF->getName() << "\n";
         errs() <<  "New Type: " << *newF->getType() << "\n";
         replaceNodeFunctionInIR(*f->getParent(), f, newF);
+        errs() << "Function after fixing return type\n" << *newF << "\n";
+        // Argument type list.
     }
 
 
@@ -1580,6 +1587,12 @@ static Function* transformReturnTypeToStruct(Function* F) {
   // Currently only works for void return types
   DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n");
 
+  if (isa<StructType>(F->getReturnType())) {
+    DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " << *F->getReturnType() << "\n");
+    errs() << "Return type is already a Struct:\nFunction:\n " << *F << ":\n\tReturn Type: " << *F->getReturnType() << "\n";
+    return F;
+  }
+
   assert(F->getReturnType()->isVoidTy() && "Unhandled case - Only void return type handled\n");
 
   // Create the argument type list with added argument types
diff --git a/llvm/projects/visc-rt/visc-rt.cpp b/llvm/projects/visc-rt/visc-rt.cpp
index fe8d7ad566f6dc862c101f521938366ad1bec9f9..4cb652135b6b50060c8b7a8ed8979f68e5ce5ced 100644
--- a/llvm/projects/visc-rt/visc-rt.cpp
+++ b/llvm/projects/visc-rt/visc-rt.cpp
@@ -1327,6 +1327,11 @@ void* llvm_visc_ocl_argument_ptr(void* graphID, void* input, int arg_index, size
   pthread_mutex_lock(&ocl_mtx);
   // Set Kernel Argument
   //pthread_mutex_lock(&ocl_mtx);
+  cout << "clKernel:" << Context->clKernel << "\n";
+  cout << "arg_index:" << arg_index << "\n";
+  cout << "sizeof(clmem):" << sizeof(cl_mem) << "\n";
+  cout << "sizeof(d_input):" << sizeof(d_input) << "\n";
+  cout << "&d_input:" << (void*) d_input << "\n";
   cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input);
   //pthread_mutex_unlock(&ocl_mtx);
   checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/kernel.cl b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/kernel.cl
index f376a27d90003e3c7c18dafb9f64a8b459a40029..5ee2fd5df0a57bc59c1f714bc3efb6b3670b0386 100644
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/kernel.cl
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/kernel.cl
@@ -16,10 +16,10 @@ __kernel void mysgemmNT( __global const float *A, int lda, __global const float
     int m = get_global_id(0);
     int n = get_global_id(1);
 
-    for (int i = 0; i < k; ++i) {
-	float a = A[m + i * lda]; 
-	float b = B[n + i * ldb];
-	c += a * b;
-    }
-    C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c;
+    /*for (int i = 0; i < k; ++i) {*/
+	/*float a = A[m + i * lda]; */
+	/*float b = B[n + i * ldb];*/
+	/*c += a * b;*/
+    /*}*/
+    C[m+n*ldc] = beta + alpha * c;
 }
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc
index cd91d26735295328745a975df42706cf6574b149..b20c6a996a815f818e25091073a94cd41939a1ec 100644
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/opencl_base/main.cc
@@ -145,6 +145,19 @@ int main (int argc, char *argv[]) {
   clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
   CHECK_ERROR("clBuildProgram")
 
+  size_t binarySizes = 0;
+  clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySizes, NULL);
+  CHECK_ERROR("clGetProgramInfo")
+
+  std::cout << "Binary Size = " << binarySizes << "\n";
+
+  unsigned char* binaries = (unsigned char*) malloc(binarySizes);
+  clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, binarySizes, &binaries, NULL);
+  CHECK_ERROR("clGetProgramInfo")
+
+  std::cout << "Binary = \n" << binaries << "\n";
+
+
   cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
   CHECK_ERROR("clCreateKernel")
 
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc/main.cc
index b2f798a51490007331824b5a9e1237309f9579c2..17eb5656df92dd2f08d89c4d6149e2e5d06526e4 100644
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc/main.cc
@@ -48,10 +48,7 @@ void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k,
     for (int i = 0; i < k; ++i) {
         float a = A[m + i * lda];
         float b = B[n + i * ldb];
-        //if(a>b)
-          c += a * b;
-        //else
-          //c = a*b;
+        c += a * b;
     }
     C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c;
 }
@@ -95,13 +92,20 @@ int main (int argc, char *argv[]) {
     /* Read command line. Expect 3 inputs: A, B and B^T
        in column-major layout*/
     params = pb_ReadParameters(&argc, argv);
+
+    unsigned iter = 0;
+    while(params->inpFiles[iter] != NULL) {
+        printf("Found input file %d - %s\n", iter, params->inpFiles[iter]);
+        iter++;
+    }
     if ((params->inpFiles[0] == NULL)
             || (params->inpFiles[1] == NULL)
             || (params->inpFiles[2] == NULL)
             || (params->inpFiles[3] != NULL))
     {
-        fprintf(stderr, "Expecting three input filenames\n");
+        printf("Expecting three input filenames\n");
         exit(-1);
+        return 0;
     }
 
     /* Read in data */
@@ -109,6 +113,7 @@ int main (int argc, char *argv[]) {
     readColMajorMatrixFile(params->inpFiles[0],
                            matArow, matAcol, matA);
 
+    printf("This is in between two reads\n");
     // load B^T
     readColMajorMatrixFile(params->inpFiles[2],
                            matBcol, matBrow, matBT);
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc
index 76e1400650926d78a5820ee5383f915d50a0a45a..663a984783be43eecb50e65a073ce20d06153261 100644
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc
@@ -107,13 +107,13 @@ void SgemmLeaf( float* A, size_t bytesA, int lda, float* B, size_t bytesB, int l
     void* thisNode = __visc__getNode();
     void* parentNode = __visc__getParentNode(thisNode);
 
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
+    long lx = __visc__getNodeInstanceID_x(thisNode);
+    long ly = __visc__getNodeInstanceID_y(thisNode);
 
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
+    long gx = __visc__getNodeInstanceID_x(parentNode);
+    long gy = __visc__getNodeInstanceID_y(parentNode);
 
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
+    long dimx = __visc__getNumNodeInstances_x(thisNode);
 
     float c[TILE_N];
     for (int i=0; i < TILE_N; i++)
diff --git a/llvm/test/VISC/parboil/common/include/visc.h b/llvm/test/VISC/parboil/common/include/visc.h
index 62eca0c298ba7f7725080e677b7c61a1b8dec782..f25c3cc4dcf090b6a057c369da732b80895788d0 100644
--- a/llvm/test/VISC/parboil/common/include/visc.h
+++ b/llvm/test/VISC/parboil/common/include/visc.h
@@ -45,12 +45,12 @@ void* __visc__getNode();
 void* __visc__getParentNode(void*);
 void __visc__barrier();
 void* __visc__malloc(long);
-unsigned __visc__getNodeInstanceID_x(void*);
-unsigned __visc__getNodeInstanceID_y(void*);
-unsigned __visc__getNodeInstanceID_z(void*);
-unsigned __visc__getNumNodeInstances_x(void*);
-unsigned __visc__getNumNodeInstances_y(void*);
-unsigned __visc__getNumNodeInstances_z(void*);
+long __visc__getNodeInstanceID_x(void*);
+long __visc__getNodeInstanceID_y(void*);
+long __visc__getNodeInstanceID_z(void*);
+long __visc__getNumNodeInstances_x(void*);
+long __visc__getNumNodeInstances_y(void*);
+long __visc__getNumNodeInstances_z(void*);
 
 // Atomic
 // signed int
@@ -90,10 +90,10 @@ float __visc__cos(float);
 
 #include <unistd.h>
 
-int get_global_id(int);
-int get_group_id(int);
-int get_local_id(int);
-int get_local_size(int);
+long get_global_id(int);
+long get_group_id(int);
+long get_local_id(int);
+long get_local_size(int);
 
 
 void llvm_visc_track_mem(void*, size_t);
diff --git a/llvm/test/VISC/parboil/common/mk/opencl.mk b/llvm/test/VISC/parboil/common/mk/opencl.mk
index 4a05a389bc9795e86418c1bbb252b988b57fda96..3e2df549b749574d14240d1bfd3d6de8f43d2803 100644
--- a/llvm/test/VISC/parboil/common/mk/opencl.mk
+++ b/llvm/test/VISC/parboil/common/mk/opencl.mk
@@ -11,7 +11,7 @@ CXXFLAGS=$(LANG_CXXFLAGS) $(PLATFORM_CXXFLAGS) $(APP_CXXFLAGS)
 LDFLAGS=$(LANG_LDFLAGS) $(PLATFORM_LDFLAGS) $(APP_LDFLAGS)
 
 
-LLVM_INSTALL:=$(LLVM_SRC_ROOT)/Release+Asserts
+LLVM_INSTALL:=$(LLVM_SRC_ROOT)/../build
 LIBCLC:=$(LLVM_SRC_ROOT)/../libclc
 LLVM_CC:=$(LLVM_INSTALL)/bin/clang
 LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link
diff --git a/llvm/test/VISC/parboil/common/mk/visc.mk b/llvm/test/VISC/parboil/common/mk/visc.mk
index e3b541378df6d2e1c82410648761995aee63b820..325301aa11dca7998e75df88bc532d3597663ab7 100755
--- a/llvm/test/VISC/parboil/common/mk/visc.mk
+++ b/llvm/test/VISC/parboil/common/mk/visc.mk
@@ -14,10 +14,11 @@ LIBCLC_LIB_PATH = $(LLVM_SRC_ROOT)/../../libclc-install/lib/clc
 VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt
 
 VISC_RT_LIB = $(VISC_RT_PATH)/visc-rt.ll
-LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx--nvidiacl.bc
+#LIBCLC_NVPTX_LIB = $(LIBCLC_LIB_PATH)/nvptx--nvidiacl.bc
+LIBCLC_NVPTX_LIB = nvptx64--nvidiacl.bc
 
 TESTGEN_OPTFLAGS = -load LLVMGenVISC.so -genvisc -globaldce
-KERNEL_GEN_FLAGS = -O3 -target nvptx
+KERNEL_GEN_FLAGS = -O3 -target nvptx64-nvidia-nvcl
 
 ifeq ($(TARGET),x86)
   DEVICE = SPIR_TARGET