diff --git a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 4fc3934246fedde551eaa09e47493b1e0ebf6084..2961a966afb63e48fa80e241e853d31264abe6b2 100644
--- a/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/llvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -25,6 +25,7 @@
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker.h"
 #include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/FileSystem.h"
 
 #include <sstream>
 
@@ -36,11 +37,13 @@ using namespace builddfg;
 namespace {
 
 // Helper function declarations
-  void changeDataLayout(Module &);
-  void changeTargetTriple(Module &);
-  std::string printType(Type*);
-  std::string convertInt(int);
-  void findReturnInst(Function *, std::vector<ReturnInst *> &);
+  static std::string getPTXFilename(const Module&);  // module identifier with ".nvptx.s" appended
+  static std::string getFilenameFromModule(const Module& M);  // module identifier text after its last '/'
+  static void changeDataLayout(Module &);  // data layout for compiling the module with the NVPTX backend
+  static void changeTargetTriple(Module &);  // presumably picks an nvptx/nvptx64 triple -- body not fully visible, confirm
+  static std::string printType(Type*);  // string representation of a type
+  static std::string convertInt(int);  // decimal text of the number
+  static void findReturnInst(Function *, std::vector<ReturnInst *> &);  // fills the vector with the function's return instructions
 
   // DFG2LLVM_NVPTX - The first implementation.
   struct DFG2LLVM_NVPTX : public ModulePass {
@@ -64,6 +67,26 @@ namespace {
   };
 
   // Visitor for Code generation traversal (tree traversal for now)
+  // Kernel - launch parameters recorded for one kernel. Asserted invariant:
+  // gridDim == globalWGSize.size() and blockDim == localWGSize.size().
+  class Kernel {
+  public:
+    Kernel(Function* _KF, unsigned _gridDim = 0,
+           const std::vector<Value*>& _globalWGSize = std::vector<Value*>(),
+           unsigned _blockDim = 0,
+           const std::vector<Value*>& _localWGSize = std::vector<Value*>())
+      : KernelFunction(_KF), gridDim(_gridDim), blockDim(_blockDim), // member declaration order (no -Wreorder)
+        globalWGSize(_globalWGSize), localWGSize(_localWGSize) {
+      assert(gridDim == globalWGSize.size() && "gridDim should be same as the size of vector globalWGSize");
+      assert(blockDim == localWGSize.size() && "blockDim should be same as the size of vector localWGSize");
+    }
+    Function* KernelFunction;          // kernel function; may be constructed as NULL and assigned later
+    unsigned gridDim;                  // number of entries in globalWGSize
+    unsigned blockDim;                 // number of entries in localWGSize
+    std::vector<Value*> globalWGSize;  // one size value per grid dimension
+    std::vector<Value*> localWGSize;   // one size value per block dimension; may be empty
+  };
+
   class CodeGenTraversal : public DFNodeVisitor {
 
   private:
@@ -71,14 +94,8 @@ namespace {
     Module &M;
     Module &KernelM;
     BuildDFG &DFG;
-    DFNode * KernelLaunchNode;
-    struct { Function * KF;
-             unsigned gridDim;
-             unsigned blockDim;
-             std::vector<Value*> localWGSize;
-             std::vector<Value*> globalWGSize;
-           } kernel;
-
+    DFNode* KernelLaunchNode;
+    Kernel* kernel;
     // Map from Old function associated with DFNode to new cloned function with
     // extra index and dimension arguments. This map also serves to find out if
     // we already have an index and dim extended function copy or not (i.e.,
@@ -110,7 +127,7 @@ namespace {
     Argument* getArgumentAt(Function* F, unsigned offset);
     Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
                         Instruction* InsertBefore);
-    void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& Kernel);
+    void insertRuntimeCalls(DFInternalNode* N, const Twine& FileName);
 
     void codeGen(DFInternalNode* N);
     void codeGen(DFLeafNode* N);
@@ -326,11 +343,14 @@ namespace {
   // used to generate a function to associate with this leaf node. The function
   // is responsible for all the memory allocation/transfer and invoking the
   // kernel call on the device
-  void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName, const Twine& KernelName) {
+  void CodeGenTraversal::insertRuntimeCalls(DFInternalNode* N, const Twine& FileName) {
     // Check if clone already exists. If it does, it means we have visited this
-    // function before and nothing else needs to be done for this leaf node.
+    // function before.
     assert(N->getGenFunc() == NULL && "Code already generated for this node");
 
+    // If kernel struct has not been initialized with kernel function, then fail
+    assert(kernel != NULL && "No kernel found!!");
+
     DEBUG(errs() << "Generating kernel call code\n");
 
     Function* F = N->getFuncPointer();
@@ -401,10 +421,12 @@ namespace {
 
     DEBUG(errs() << "Initializing commandQ" << "\n");
     // Initialize command queue
-    Value* file = getStringPointer(FileName, RI, "Filename");
-    Value* kernel = getStringPointer(KernelName, RI,"KernelName");
+    Value* fileStr = getStringPointer(FileName, RI, "Filename");
+    errs() << *fileStr << "\n";
+    errs() << "Generating code for kernel - " << kernel->KernelFunction->getName()<< "\n";
+    Value* kernelStr = getStringPointer(kernel->KernelFunction->getName(), RI,"KernelName");
 
-    Value* LaunchInstArgs[] = {file, kernel};
+    Value* LaunchInstArgs[] = {fileStr, kernelStr};
 
     DEBUG(errs() << "Inserting launch call" << "\n");
     CallInst* GraphID = CallInst::Create(llvm_visc_ptx_launch,
@@ -489,15 +511,46 @@ namespace {
     // Need work dim, localworksize, globalworksize
     // FIXME: Talk to DFG2LLVM_PTX pass to figure out the workdim, loacal work
     // size and global work size
+    // Allocate size_t[numDims] space on stack. Store the work group sizes and
+    // pass it as an argument to ExecNode
+    Type* Int64Ty = Type::getInt64Ty(M.getContext());
+    Type* GlobalWGTy = ArrayType::get(Int64Ty, kernel->gridDim);
+    AllocaInst* GlobalWG = new AllocaInst(GlobalWGTy, "GlobalWGSize", RI);
+    Value* GlobalWGPtr = BitCastInst::CreatePointerCast(GlobalWG, Int64Ty->getPointerTo(), GlobalWG->getName()+".0", RI);
+    Value* nextDim = GlobalWGPtr;
+    errs() << *GlobalWGPtr << "\n";
+    Constant* IntOne = ConstantInt::get(Int64Ty, 1);
+    errs() << *IntOne << "\n";
+    for(unsigned i=0; i < kernel->gridDim; i++) {
+      errs() << *kernel->globalWGSize[i]->getType() << "\n";
+      errs() << *nextDim->getType() << "\n";
+      assert(kernel->globalWGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
+      if(kernel->globalWGSize[i]->getType() != Int64Ty) {
+        kernel->globalWGSize[i] = BitCastInst::CreateIntegerCast(VMap[kernel->globalWGSize[i]], Int64Ty, true, "", RI);
+        StoreInst* SI = new StoreInst(kernel->globalWGSize[i], nextDim, RI);
+        errs() << *SI << "\n";
+      } else {
+        StoreInst* SI = new StoreInst(VMap[kernel->globalWGSize[i]], nextDim, RI);
+        errs() << *SI << "\n";
+      }
+      if(i+1 < kernel->gridDim) {
+        GetElementPtrInst* GEP = GetElementPtrInst::Create(nextDim, ArrayRef<Value*>(IntOne), GlobalWG->getName()+"."+Twine(i+1), RI);
+        errs() << *GEP << "\n";
+        nextDim = GEP;
+      }
+    }
+    errs() << *llvm_visc_ptx_executeNode << "\n";
+    errs() << *GlobalWGPtr << "\n";
     Value* ExecNodeArgs[] = {GraphID,
                             ConstantInt::get(Type::getInt32Ty(M.getContext()), C->getNumOfDim()),
                             Constant::getNullValue(Type::getInt64PtrTy(M.getContext())),
-                            Constant::getNullValue(Type::getInt64PtrTy(M.getContext()))
+                            GlobalWGPtr
                             };
     CallInst* Event = CallInst::Create(llvm_visc_ptx_executeNode,
                                        ArrayRef<Value*>(ExecNodeArgs, 4),
                                        "event."+CF->getName(),
                                        RI);
+    errs() << *Event << "\n";
     // Wait for Kernel to Finish
     CallInst::Create(llvm_visc_ptx_wait,
                      ArrayRef<Value*>(GraphID),
@@ -615,9 +668,9 @@ namespace {
 
       // Now the remaining nodes to be visited should be ignored
       KernelLaunchNode = NULL;
-      writeKernelsModule();
       errs() << "Insert Runtime calls\n";
-      insertRuntimeCalls(N, getKernelsModuleName(M), "matrixMul");
+      insertRuntimeCalls(N, getPTXFilename(M));
+      writeKernelsModule();
 
     } else {
       DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
@@ -643,31 +696,36 @@ namespace {
 
     if (!pLevel || !pReplFactor) {
       KernelLaunchNode = PNode;
+      kernel = new Kernel(NULL, N->getNumOfDim(), N->getDimLimits());
       // TODO: Find a better way of choosing parameters
-      kernel.gridDim = N->getNumOfDim();
-      kernel.blockDim = N->getNumOfDim();
-      kernel.globalWGSize = N->getDimLimits();
-      IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
+      //kernel->gridDim = N->getNumOfDim();
+      //kernel->blockDim = N->getNumOfDim();
+      //kernel->globalWGSize = N->getDimLimits();
+      //kernel->localWGSize = N->getDimLimits();
+      //FIXME: Comment this out as we can provide localWGSize as null
+      //IntegerType* IntTy = Type::getInt32Ty(KernelM.getContext());
       // TODO: How to choose the div factor;
-      ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16);
-      std::vector<Value*> tmp(kernel.gridDim, divFactor);
-      for (unsigned i = 0; i < kernel.gridDim; i++) {
-        BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel.globalWGSize[i],tmp[i]);
-        kernel.localWGSize.push_back(SDivInst);
-      }
+      //ConstantInt* divFactor = ConstantInt::getSigned(IntTy, (int64_t) 16);
+      //std::vector<Value*> tmp(kernel->gridDim, divFactor);
+      //for (unsigned i = 0; i < kernel->gridDim; i++) {
+      //  BinaryOperator* SDivInst = BinaryOperator::CreateSDiv(kernel->globalWGSize[i],tmp[i]);
+      //  kernel->localWGSize.push_back(SDivInst);
+      //}
     }
     else {
+      errs() << "*************** Entering else part **************\n";
+      /*
       KernelLaunchNode = PNode->getParent();
-      kernel.gridDim = PNode->getNumOfDim();
-      kernel.blockDim = N->getNumOfDim();
+      kernel->gridDim = PNode->getNumOfDim();
+      kernel->blockDim = N->getNumOfDim();
       // TODO: Handle different number of dimensions
-      assert((kernel.gridDim == kernel.blockDim) && "Dimension number must match");
+      assert((kernel->gridDim == kernel->blockDim) && "Dimension number must match");
       std::vector<Value*> numOfBlocks = PNode->getDimLimits();
-      kernel.localWGSize = N->getDimLimits();
-      for (unsigned i = 0; i < kernel.gridDim; i++) {
-        BinaryOperator* MulInst = BinaryOperator::CreateMul(kernel.localWGSize[i],numOfBlocks[i]);
-        kernel.globalWGSize.push_back(MulInst);
-      }
+      kernel->localWGSize = N->getDimLimits();
+      for (unsigned i = 0; i < kernel->gridDim; i++) {
+        //BinaryOperator* MulInst = BinaryOperator::CreateMul(kernel->localWGSize[i],numOfBlocks[i]);
+        //kernel->globalWGSize.push_back(MulInst);
+      }*/
     }
 
     std::vector<IntrinsicInst *> IItoRemove;
@@ -922,7 +980,8 @@ namespace {
       (*ri)->eraseFromParent();
 
     addCLMetadata(F_nvptx);
-    kernel.KF = F_nvptx;
+    kernel->KernelFunction = F_nvptx;
+    errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
     DEBUG(errs() << KernelM);
 
     return;
@@ -945,13 +1004,19 @@ namespace {
     // Initiate code generation for root DFNode
     CGTVisitor->visit(Root);
     //TODO: Edit module epilogue to remove the VISC intrinsic declarations
+    delete CGTVisitor;
 
     return true;
   }
 
   std::string CodeGenTraversal::getKernelsModuleName(Module &M) {
+    /* Returns M's module identifier with ".kernels.ll" appended; whatever
+     * directory part the identifier carries is kept in the result.
+     * NOTE(review): a variant that built "<cwd>/Output/<input file name>"
+     * from llvm::sys::fs::current_path and getFilenameFromModule was left
+     * disabled here -- confirm the intended output location before reviving. */
     std::string mid = M.getModuleIdentifier();
-    return mid.append("_kernels.ll");
+    return mid.append(".kernels.ll");
   }
 
   void CodeGenTraversal::fixValueAddrspace(Value* V, unsigned addrspace) {
@@ -1174,9 +1239,22 @@ namespace {
  *                              Helper functions                              *
  ******************************************************************************/
 
+  // Name of the generated PTX binary for M: the module identifier with the
+  // ".nvptx.s" suffix appended.
+  static std::string getPTXFilename(const Module& M) {
+    const std::string ptxSuffix(".nvptx.s");
+    return std::string(M.getModuleIdentifier()) + ptxSuffix;
+  }
+
+  // Input file name: the module identifier of M with everything up to and
+  // including its last '/' removed (unchanged when it contains no '/').
+  static std::string getFilenameFromModule(const Module& M) {
+    const std::string path(M.getModuleIdentifier());
+    return path.substr(path.rfind('/') + 1); // npos + 1 wraps to 0, keeping the whole string
+  }
+
   // Changes the data layout of the Module to be compiled with NVPTX backend
   // TODO: Figure out when to call it, probably after duplicating the modules
-  void changeDataLayout(Module &M) {
+  static void changeDataLayout(Module &M) {
     std::string nvptx32_layoutStr = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
     std::string nvptx64_layoutStr = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64";
 
@@ -1189,7 +1267,7 @@ namespace {
     return;
   }
 
-  void changeTargetTriple(Module &M) {
+  static void changeTargetTriple(Module &M) {
     std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
     std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
 
@@ -1203,7 +1281,7 @@ namespace {
   }
 
 // Helper function, generate a string representation of a type
-  std::string printType(Type* ty) {
+  static std::string printType(Type* ty) {
     std::string type_str;
     raw_string_ostream rso(type_str);
     ty->print(rso);
@@ -1211,14 +1289,14 @@ namespace {
   }
 
 // Helper function, convert int to string
-  std::string convertInt(int number) {
+  static std::string convertInt(int number) {
    std::stringstream ss;//create a stringstream
    ss << number;//add number to the stream
    return ss.str();//return a string with the contents of the stream
   }
 
 // Helper function, populate a vector with all return statements in a function
-  void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
+  static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
     for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
       Instruction *I = &(*i);
       ReturnInst* RI = dyn_cast<ReturnInst>(I);