diff --git a/hpvm/lib/Transforms/CMakeLists.txt b/hpvm/lib/Transforms/CMakeLists.txt
index 8049471a16910a0b33b8e60ebca577d395627222..68724684e56648d307df52624e47ed7393bfd3f9 100644
--- a/hpvm/lib/Transforms/CMakeLists.txt
+++ b/hpvm/lib/Transforms/CMakeLists.txt
@@ -1,7 +1,6 @@
 add_subdirectory(BuildDFG)
 add_subdirectory(ClearDFG)
 add_subdirectory(DFG2LLVM_NVPTX)
-#add_subdirectory(DFG2LLVM_SPIR)
 add_subdirectory(DFG2LLVM_X86)
 add_subdirectory(GenVISC)
 add_subdirectory(LocalMem)
diff --git a/hpvm/lib/Transforms/DFG2LLVM_SPIR/CMakeLists.txt b/hpvm/lib/Transforms/DFG2LLVM_SPIR/CMakeLists.txt
deleted file mode 100644
index 15d3ab08ad185a6df26a02e4c4463936c9b963c5..0000000000000000000000000000000000000000
--- a/hpvm/lib/Transforms/DFG2LLVM_SPIR/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-if(WIN32 OR CYGWIN)
-  set(LLVM_LINK_COMPONENTS Core Support)
-endif()
-
-add_llvm_library( LLVMDFG2LLVM_SPIR
-  MODULE
-  DFG2LLVM_SPIR.cpp
-
-  DEPENDS
-  intrinsics_gen
-  PLUGIN_TOOL
-  opt
-  )
diff --git a/hpvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp b/hpvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
deleted file mode 100644
index e294a94537576ac5ca748af7fd17d70e5a133717..0000000000000000000000000000000000000000
--- a/hpvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.cpp
+++ /dev/null
@@ -1,2010 +0,0 @@
-//=== DFG2LLVM_SPIR.cpp ===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#define ENABLE_ASSERTS
-#define TARGET_PTX 32
-#define GENERIC_ADDRSPACE 0
-#define GLOBAL_ADDRSPACE 1
-#define SHARED_ADDRSPACE 3
-
-#define DEBUG_TYPE "DFG2LLVM_SPIR"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/Linker/Linker.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/IR/Attributes.h"
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCTimer.h"
-#include "SupportVISC/DFG2LLVM.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm-c/Core.h"
-
-#include "SupportVISC/VISCUtils.h"
-#include "llvm/IR/IRPrintingPasses.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/IR/UseListOrder.h"
-
-#include <sstream>
-
-using namespace llvm;
-using namespace builddfg;
-using namespace dfg2llvm;
-using namespace viscUtils;
-
-// VISC Command line option to use timer or not
-static cl::opt<bool>
-VISCTimer_SPIR("visc-timers-spir", cl::desc("Enable visc timers"));
-
-namespace {
-// Helper class declarations
-
-// Class to maintain the tuple of host pointer, device pointer and size
-// in bytes. Would have preferred to use tuple but support not yet available
-class OutputPtr {
-public:
-  OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes)
-    : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
-
-  Value* h_ptr;
-  Value* d_ptr;
-  Value* bytes;
-};
-
-// Class to maintain important kernel info required for generating runtime
-// calls
-class Kernel {
-public:
-  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap =
-         std::map<unsigned, unsigned>(),
-         std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap =
-         std::map<unsigned, std::pair<Value*, unsigned> >(),
-         std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
-         unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(),
-         unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>())
-    : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
-      sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim),
-      globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) {
-
-    assert(gridDim == globalWGSize.size()
-           && "gridDim should be same as the size of vector globalWGSize");
-    assert(blockDim == localWGSize.size()
-           && "blockDim should be same as the size of vector localWGSize");
-  }
-
-  Function* KernelFunction;
-  DFLeafNode* KernelLeafNode;
-  std::map<unsigned, unsigned> inArgMap;
-  // Map for shared memory arguments
-  std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap;
-  // Fields for (potential) allocation node
-  DFLeafNode* AllocationNode;
-  Function* AllocationFunction;
-  std::map<unsigned, unsigned> allocInArgMap;  
-
-  std::vector<unsigned> outArgMap;
-  unsigned gridDim;
-  std::vector<Value*> globalWGSize;
-  unsigned blockDim;
-  std::vector<Value*> localWGSize;
-  std::vector<int> localDimMap;
-
-  std::map<unsigned, unsigned> getInArgMap() {
-    return inArgMap;
-  }
-  void setInArgMap(std::map<unsigned, unsigned> map) {
-    inArgMap = map;
-  }
-
-  std::map<unsigned, std::pair<Value*, unsigned> > getSharedInArgMap() {
-    return sharedInArgMap;
-  }
-  void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) {
-    sharedInArgMap = map;
-  }
-
-  std::vector<unsigned> getOutArgMap() {
-    return outArgMap;
-  }
-  void setOutArgMap(std::vector<unsigned> map) {
-    outArgMap = map;
-  }
-
-  void setLocalWGSize(std::vector<Value*> V) {
-    localWGSize = V;
-  }
-
-  bool hasLocalWG() {
-    return blockDim != 0;
-  }
-};
-
-// Helper function declarations
-static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*,
-                                 ValueToValueMapTy&, Instruction*);
-static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&,
-                              Instruction*, const Twine& WGName = "WGSize");
-static std::string getSPIRFilename(const Module&);
-static std::string getFilenameFromModule(const Module& M);
-static void changeDataLayout(Module &);
-static void changeTargetTriple(Module &);
-static std::string printType(Type*);
-static StringRef getMangledName(std::string);
-static StringRef getAtomicMangledName(std::string, unsigned, bool);
-static void findReturnInst(Function *, std::vector<ReturnInst *> &);
-static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &);
-static StringRef getAtomicOpName(Intrinsic::ID, unsigned);
-static std::string getMathFunctionName(Intrinsic::ID);
-
-// DFG2LLVM_SPIR - The first implementation.
-struct DFG2LLVM_SPIR : public DFG2LLVM {
-  static char ID; // Pass identification, replacement for typeid
-  DFG2LLVM_SPIR() : DFG2LLVM(ID) {}
-
-private:
-
-public:
-  bool runOnModule(Module &M);
-};
-
-// Visitor for Code generation traversal (tree traversal for now)
-class CGT_SPIR : public CodeGenTraversal {
-
-private:
-  //Member variables
-  std::unique_ptr<Module> KernelM;
-  DFNode* KernelLaunchNode = nullptr;
-  Kernel* kernel;
-
-  // VISC Runtime API
-  Constant* llvm_visc_ocl_launch;
-  Constant* llvm_visc_ocl_wait;
-  Constant* llvm_visc_ocl_initContext;
-  Constant* llvm_visc_ocl_clearContext;
-  Constant* llvm_visc_ocl_argument_shared;
-  Constant* llvm_visc_ocl_argument_scalar;
-  Constant* llvm_visc_ocl_argument_ptr;
-  Constant* llvm_visc_ocl_output_ptr;
-  Constant* llvm_visc_ocl_free;
-  Constant* llvm_visc_ocl_getOutput;
-  Constant* llvm_visc_ocl_executeNode;
-
-  //Functions
-  std::string getKernelsModuleName(Module &M);
-  void fixValueAddrspace(Value* V, unsigned addrspace);
-  Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned i);
-  void removeAttributeAtArguments(Function* F, std::vector<unsigned> &Ags, Attribute::AttrKind attrKind);
-  void addCLMetadata(Function* F);
-  Function* transformFunctionToVoid(Function* F);
-  void removeInOutAttributes(Function* F);
-  void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName);
-
-  // Virtual Functions
-  void init() {
-    VISCTimer = VISCTimer_SPIR;
-    TargetName = "SPIR";
-  }
-  void initRuntimeAPI();
-  void codeGen(DFInternalNode* N);
-  void codeGen(DFLeafNode* N);
-
-public:
-
-  // Constructor
-  CGT_SPIR(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(&_M)) {
-    KernelLaunchNode = NULL;
-    init();
-    initRuntimeAPI();
-    errs() << "Old module pointer: " << &_M << "\n";
-    errs() << "New module pointer: " <<  KernelM.get() << "\n";
-    // Copying instead of creating new, in order to preserve required info (metadata)
-    // Remove functions, global variables and aliases
-    std::vector<GlobalVariable*> gvv = std::vector<GlobalVariable*>();
-    for (Module::global_iterator mi = KernelM->global_begin(),
-         me = KernelM->global_end(); (mi != me); ++mi) {
-      GlobalVariable* gv = &*mi;
-      gvv.push_back(gv);
-    }
-    for (std::vector<GlobalVariable*>::iterator vi = gvv.begin(); vi != gvv.end(); ++vi) {
-      (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
-      (*vi)->eraseFromParent();
-    }
-
-    std::vector<Function*> fv = std::vector<Function*>();
-    for (Module::iterator mi = KernelM->begin(),
-         me = KernelM->end(); (mi != me); ++mi) {
-      Function* f = &*mi;
-      fv.push_back(f);
-    }
-    for (std::vector<Function*>::iterator vi = fv.begin(); vi != fv.end(); ++vi) {
-      (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
-      (*vi)->eraseFromParent();
-    }
-
-    std::vector<GlobalAlias*> av = std::vector<GlobalAlias*>();
-    for (Module::alias_iterator mi = KernelM->alias_begin(),
-         me = KernelM->alias_end(); (mi != me); ++mi) {
-      GlobalAlias* a = &*mi;
-      av.push_back(a);
-    }
-    for (std::vector<GlobalAlias*>::iterator vi = av.begin(); vi != av.end(); ++vi) {
-      (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
-      (*vi)->eraseFromParent();
-    }
-
-    changeDataLayout(*KernelM);
-    changeTargetTriple(*KernelM);
-
-    DEBUG(errs() << *KernelM);
-
-  }
-
-  void removeLLVMIntrinsics();
-  void writeKernelsModule();
-};
-
-// Initialize the VISC runtime API. This makes it easier to insert these calls
-void CGT_SPIR::initRuntimeAPI() {
-
-  // Load Runtime API Module
-  SMDiagnostic Err;
-
-  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
-  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
-
-  Twine llvmSrcRoot = LLVM_SRC_ROOT;
-  Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/visc-rt/visc-rt.ll";
-  errs() << "Open file: " << runtimeAPI.str() << "\n";
-  runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
-  if(runtimeModule == NULL)
-    DEBUG(errs() << Err.getMessage());
-  else
-    errs() << "Successfully loaded visc-rt API module\n";
-
-  // Get or insert the global declarations for launch/wait functions
-  DECLARE(llvm_visc_ocl_launch);
-  DECLARE(llvm_visc_ocl_wait);
-  DECLARE(llvm_visc_ocl_initContext);
-  DECLARE(llvm_visc_ocl_clearContext);
-  DECLARE(llvm_visc_ocl_argument_shared);
-  DECLARE(llvm_visc_ocl_argument_scalar);
-  DECLARE(llvm_visc_ocl_argument_ptr);
-  DECLARE(llvm_visc_ocl_output_ptr);
-  DECLARE(llvm_visc_ocl_free);
-  DECLARE(llvm_visc_ocl_getOutput);
-  DECLARE(llvm_visc_ocl_executeNode);
-
-  // Get or insert timerAPI functions as well if you plan to use timers
-  initTimerAPI();
-
-  // Insert init context in main
-  DEBUG(errs() << "Gen Code to initialize SPIR Timer\n");
-  Function* VI = M.getFunction("llvm.visc.init");
-  assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
-
-  InitCall = cast<Instruction>(*VI->user_begin());
-  initializeTimerSet(InitCall);
-  switchToTimer(visc_TimerID_INIT_CTX, InitCall);
-  CallInst::Create(llvm_visc_ocl_initContext,
-                  ArrayRef<Value*>(getTargetID(M, visc::SPIR_TARGET)),
-                  "", InitCall);
-  switchToTimer(visc_TimerID_NONE, InitCall);
-
-  // Insert print instruction at visc exit
-  DEBUG(errs() << "Gen Code to print SPIR Timer\n");
-  Function* VC = M.getFunction("llvm.visc.cleanup");
-  DEBUG(errs() << *VC << "\n");
-  assert(VC->getNumUses() == 1 && "__visc__clear should only be used once");
-
-  CleanupCall = cast<Instruction>(*VC->user_begin());
-  printTimerSet(CleanupCall);
-
-
-}
-
-// Generate Code to call the kernel
-// The plan is to replace the internal node with a leaf node. This method is
-// used to generate a function to associate with this leaf node. The function
-// is responsible for all the memory allocation/transfer and invoking the
-// kernel call on the device
-void CGT_SPIR::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) {
-  // Check if clone already exists. If it does, it means we have visited this
-  // function before.
-//  assert(N->getGenFunc() == NULL && "Code already generated for this node");
-
-  assert(N->getGenFuncForTarget(visc::SPIR_TARGET) == NULL &&
-         "Code already generated for this node");
-
-  // Useful values
-  Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
-  Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
-
-  // If kernel struct has not been initialized with kernel function, then fail
-  assert(K != NULL && "No kernel found!!");
-
-  DEBUG(errs() << "Generating kernel call code\n");
-
-  Function* F = N->getFuncPointer();
-
-
-  // Create of clone of F with no instructions. Only the type is the same as F
-  // without the extra arguments.
-  Function* F_X86;
-
-  // Clone the function, if we are seeing this function for the first time. We
-  // only need a clone in terms of type.
-  ValueToValueMapTy VMap;
-
-  // Create new function with the same type
-  F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
-
-  // Loop over the arguments, copying the names of arguments over.
-  Function::arg_iterator dest_iterator = F_X86->arg_begin();
-  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
-       i != e; ++i) {
-    dest_iterator->setName(i->getName()); // Copy the name over...
-    // Increment dest iterator
-    ++dest_iterator;
-  }
-
-  // Add a basic block to this empty function
-  BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
-  ReturnInst* RI = ReturnInst::Create(M.getContext(),
-                                      UndefValue::get(F_X86->getReturnType()), BB);
-
-  // FIXME: Adding Index and Dim arguments are probably not required except
-  // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
-  // have those arguments)
-
-  // Add Index and Dim arguments except for the root node
-  if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
-    F_X86 = addIdxDimArgs(F_X86);
-
-  BB = &*F_X86->begin();
-  RI = cast<ReturnInst>(BB->getTerminator());
-
-  //Add the generated function info to DFNode
-//  N->setGenFunc(F_X86, visc::CPU_TARGET);
-  N->addGenFunc(F_X86, visc::SPIR_TARGET, true);
-
-  // Loop over the arguments, to create the VMap
-  dest_iterator = F_X86->arg_begin();
-  for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
-       i != e; ++i) {
-    // Add mapping to VMap and increment dest iterator
-    VMap[&*i] = &*dest_iterator;
-    ++dest_iterator;
-  }
-
-  /* TODO: Use this code to verufy if this is a good pattern for OCL kernel
-
-  // Sort children in topological order before code generation for kernel call
-  N->getChildGraph()->sortChildren();
-
-  // The DFNode N has the property that it has only one child (leaving Entry
-  // and Exit dummy nodes). This child is the OCL kernel. This simplifies code
-  // generation for kernel calls significantly. All the inputs to this child
-  // node would either be constants or from the parent node N.
-
-  assert(N->getChildGraph()->size() == 3
-         && "Node expected to have just one non-dummy node!");
-
-  DFNode* C;
-  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
-    C = *ci;
-    // Skip dummy node call
-    if (!C->isDummyNode())
-      break;
-  }
-
-  assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
-
-  Function* CF = C->getFuncPointer();
-  */
-  Function* KF = K->KernelLeafNode->getFuncPointer();
-  // Initialize context
-  //DEBUG(errs() << "Initializing context" << "\n");
-  //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI);
-
-  DEBUG(errs() << "Initializing commandQ" << "\n");
-  // Initialize command queue
-  switchToTimer(visc_TimerID_SETUP, InitCall);
-  Value* fileStr = getStringPointer(FileName, InitCall, "Filename");
-  DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
-  DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n");
-  Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName");
-
-  Value* LaunchInstArgs[] = {fileStr, kernelStr};
-
-  DEBUG(errs() << "Inserting launch call" << "\n");
-  CallInst* SPIR_Ctx = CallInst::Create(llvm_visc_ocl_launch,
-                                         ArrayRef<Value*>(LaunchInstArgs, 2),
-                                         "graph"+KF->getName(),
-                                         InitCall);
-  DEBUG(errs() << *SPIR_Ctx << "\n");
-  GraphIDAddr = new GlobalVariable(M,
-                                   SPIR_Ctx->getType(),
-                                   false,
-                                   GlobalValue::CommonLinkage,
-                                   Constant::getNullValue(SPIR_Ctx->getType()),
-                                   "graph"+KF->getName()+".addr");
-  DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n");
-  StoreInst* SI = new StoreInst(SPIR_Ctx, GraphIDAddr, InitCall);
-  DEBUG(errs() << *SI << "\n");
-  switchToTimer(visc_TimerID_NONE, InitCall);
-  switchToTimer(visc_TimerID_SETUP, RI);
-  Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI);
-
-  // Iterate over the required input edges of the node and use the visc-rt API
-  // to set inputs
-  DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
-  std::vector<OutputPtr> OutputPointers;
-  // Vector to hold the device memory object that need to be cleared before we release
-  // context
-  std::vector<Value*> DevicePointers;
-
-  std::map<unsigned, unsigned> kernelInArgMap = K->getInArgMap();
-/*
-  for(unsigned i=0; i<KF->getFunctionType()->getNumParams(); i++) {
-
-    // The kernel object gives us the mapping of arguments from kernel launch
-    // node function (F_X86) to kernel (kernel->KF)
-    Value* inputVal = getArgumentAt(F_X86, K->getInArgMap()[i]);
-
-*/
-  for(std::map<unsigned, unsigned>::iterator ib = kernelInArgMap.begin(),
-      ie = kernelInArgMap.end(); ib != ie; ++ib) {
-    unsigned i = ib->first;
-    Value* inputVal = getArgumentAt(F_X86, ib->second);
-    DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
-
-    // input value has been obtained.
-    // Check if input is a scalar value or a pointer operand
-    // For scalar values such as int, float, etc. the size is simply the size of
-    // type on target machine, but for pointers, the size of data would be the
-    // next integer argument
-    if(inputVal->getType()->isPointerTy()) {
-
-      switchToTimer(visc_TimerID_COPY_PTR, RI);
-      // Pointer Input
-      // CheckAttribute
-      Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False;
-      Value* isInput = ((hasAttribute(KF, i, Attribute::Out))
-                        && !(hasAttribute(KF, i, Attribute::In)))? False : True;
-
-      Argument* A = getArgumentAt(KF, i);
-      if(isOutput == True) {
-        DEBUG(errs() << *A << " is an OUTPUT argument\n");
-      }
-      if(isInput == True) {
-        DEBUG(errs() << *A << " is an INPUT argument\n");
-      }
-
-
-      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal,
-                             Type::getInt8PtrTy(M.getContext()),
-                             inputVal->getName()+".i8ptr",
-                             RI);
-
-      // Assert that the pointer argument size (next argument) is in the map
-      assert(kernelInArgMap.find(i+1) != kernelInArgMap.end());
-
-      Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]);
-
-      assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
-             && "Pointer type input must always be followed by size (integer type)");
-      Value* setInputArgs[] = {GraphID,
-                               inputValI8Ptr,
-                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                               inputSize,
-                               isInput,
-                               isOutput
-                              };
-      Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr,
-                                      ArrayRef<Value*>(setInputArgs, 6), "", RI);
-      DevicePointers.push_back(d_ptr);
-      // If this has out attribute, store the returned device pointer in
-      // memory to read device memory later
-      if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
-    }
-    else {
-      switchToTimer(visc_TimerID_COPY_SCALAR, RI);
-      // Scalar Input
-      // Store the scalar value on stack and then pass the pointer to its
-      // location
-      AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), inputVal->getName()+".ptr", RI);
-      StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
-
-      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
-                             Type::getInt8PtrTy(M.getContext()),
-                             inputVal->getName()+".i8ptr",
-                             RI);
-
-      Value* setInputArgs[] = {GraphID,
-                               inputValI8Ptr,
-                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                               ConstantExpr::getSizeOf(inputVal->getType())
-                              };
-      CallInst::Create(llvm_visc_ocl_argument_scalar,
-                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
-    }
-  }
-
-  DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
-
-  // Check to see if all the allocation sizes are constant (determined
-  // statically)
-  bool constSizes = true;
-  for (auto& e: K->getSharedInArgMap()) {
-    constSizes &= isa<Constant>(e.second.first);
-  }
-
-  // If the sizes are all constant
-  if (constSizes) {
-    for (auto& e: K->getSharedInArgMap()) {
-      unsigned argNum = e.first;
-      Value* allocSize = e.second.first;
-
-      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
-
-      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
-        // Shared memory ptr argument - scalar at size position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
-
-        assert(isa<Constant>(allocSize) && "Constant shared memory size is expected");
-
-        Value* setInputArgs[] = {GraphID,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 allocSize
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_shared,
-                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
-      }
-      else {
-        // Sharem memory size argument - scalar at address position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
-        // Store the scalar value on stack and then pass the pointer to its
-        // location
-        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(),
-                                        allocSize->getName()+".sharedMem.ptr", RI);
-        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
-
-        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
-                               Type::getInt8PtrTy(M.getContext()),
-                               allocSize->getName()+".sharedMem.i8ptr",
-                               RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                 allocSizeI8Ptr,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 ConstantExpr::getSizeOf(allocSize->getType())
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
-      }
-    }
-  } else {
-
-    Function *F_alloc = K->AllocationFunction;
-    StructType *FAllocRetTy = dyn_cast<StructType>(F_alloc->getReturnType());
-    assert(FAllocRetTy && "Allocation node with no struct return type");
-
-    std::vector<Value *> AllocInputArgs;
-    for (unsigned i = 0; i < K->allocInArgMap.size(); i++) {
-      AllocInputArgs.push_back(getArgumentAt(F_X86, K->allocInArgMap.at(i)));
-    }
-
-    CallInst *CI = CallInst::Create(F_alloc, AllocInputArgs, "", RI);
-    std::vector<ExtractValueInst *> ExtractValueInstVec;
-    for (unsigned i = 1; i < FAllocRetTy->getNumElements(); i += 2) {
-      ExtractValueInst *EI = ExtractValueInst::Create(CI, i, "", RI);
-      ExtractValueInstVec.push_back(EI);
-    }
-
-    for (auto& e: K->getSharedInArgMap()) {
-      unsigned argNum = e.first;
-      Value* allocSize = ExtractValueInstVec[e.second.second/2];
-
-      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
-
-      if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
-        // Shared memory ptr argument - scalar at size position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 allocSize
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_shared,
-                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
-      }
-      else {
-        // Sharem memory size argument - scalar at address position
-        switchToTimer(visc_TimerID_COPY_SCALAR, RI);
-        // Store the scalar value on stack and then pass the pointer to its
-        // location
-        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(),
-                                        allocSize->getName()+".sharedMem.ptr", RI);
-        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
-
-        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
-                               Type::getInt8PtrTy(M.getContext()),
-                               allocSize->getName()+".sharedMem.i8ptr",
-                               RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                 allocSizeI8Ptr,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 ConstantExpr::getSizeOf(allocSize->getType())
-                                };
-        CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
-      }
-    }
-  }
-
-
-  DEBUG(errs() << "Setup output edges of node and insert visc api\n");
-
-  // Set output if struct is not an empty struct
-  StructType* OutputTy = K->KernelLeafNode->getOutputType();
-  std::vector<Value*> d_Outputs;
-  if(!OutputTy->isEmptyTy()) {
-    switchToTimer(visc_TimerID_COPY_PTR, RI);
-    // Not an empty struct
-    // Iterate over all elements of the struct and put them in
-    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
-      unsigned outputIndex = KF->getFunctionType()->getNumParams()+i;
-      Value* setOutputArgs[] = {GraphID,
-                                ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
-                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
-
-      CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr,
-                                ArrayRef<Value*>(setOutputArgs, 3),
-                                "d_output."+KF->getName(),
-                                RI);
-      d_Outputs.push_back(d_Output);
-    }
-  }
-
-  // Enqueue kernel
-  // Need work dim, localworksize, globalworksize
-  // Allocate size_t[numDims] space on stack. Store the work group sizes and
-  // pass it as an argument to ExecNode
-
-  switchToTimer(visc_TimerID_MISC, RI);
-  Value *workDim, *LocalWGPtr, *GlobalWGPtr;
-  getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI);
-  switchToTimer(visc_TimerID_KERNEL, RI);
-  Value* ExecNodeArgs[] = {GraphID,
-                           workDim,
-                           LocalWGPtr,
-                           GlobalWGPtr
-                          };
-  CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode,
-                                     ArrayRef<Value*>(ExecNodeArgs, 4),
-                                     "event."+KF->getName(),
-                                     RI);
-  DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
-
-  // Wait for Kernel to Finish
-  CallInst::Create(llvm_visc_ocl_wait,
-                   ArrayRef<Value*>(GraphID),
-                   "",
-                   RI);
-
-  switchToTimer(visc_TimerID_READ_OUTPUT, RI);
-  // Read Output Struct if not empty
-  if(!OutputTy->isEmptyTy()) {
-    std::vector<Value*>h_Outputs;
-    Value* KernelOutput = UndefValue::get(OutputTy);
-    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
-      Value* GetOutputArgs[] = {GraphID,
-                              Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-                              d_Outputs[i],
-                              ConstantExpr::getSizeOf(OutputTy->getElementType(i))
-                             };
-      CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput,
-                                          ArrayRef<Value*>(GetOutputArgs, 4),
-                                          "h_output."+KF->getName()+".addr",
-                                          RI);
-      // Read each device pointer listed in output struct
-      // Load the output struct
-      CastInst* BI = BitCastInst::CreatePointerCast(h_Output,
-          OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI);
-
-      Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI);
-      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i),
-                                            KF->getName()+"output", RI);
-    }
-    OutputMap[K->KernelLeafNode] = KernelOutput;
-  }
-
-  // Read all the pointer arguments which had side effects i.e., had out
-  // attribute
-  DEBUG(errs() << "Output Pointers : " << OutputPointers.size() << "\n");
-  // FIXME: Not reading output pointers anymore as we read them when data is
-  // actually requested
-  /*for(auto output: OutputPointers) {
-    DEBUG(errs() << "Read: " << *output.d_ptr << "\n");
-    DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
-    DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
-
-    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
-    CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput,
-                                    ArrayRef<Value*>(GetOutputArgs, 4),
-                                    "", RI);
-  }*/
-  switchToTimer(visc_TimerID_MEM_FREE, RI);
-  // Clear Context and free device memory
-  DEBUG(errs() << "Clearing context" << "\n");
-  // Free Device Memory
-  for(auto d_ptr: DevicePointers) {
-    CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI);
-  }
-  switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall);
-  // Clear Context
-  LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall);
-  CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall);
-  switchToTimer(visc_TimerID_NONE, CleanupCall);
-
-  switchToTimer(visc_TimerID_MISC, RI);
-  DEBUG(errs() << "*** Generating epilogue code for the function****\n");
-  // Generate code for output bindings
-  // Get Exit node
-  DFNode* C = N->getChildGraph()->getExit();
-  // Get OutputType of this node
-  StructType* OutTy = N->getOutputType();
-  Value *retVal = UndefValue::get(F_X86->getReturnType());
-  // Find the kernel's output arg map, to use instead of the bindings
-  std::vector<unsigned> outArgMap = kernel->getOutArgMap();
-  // Find all the input edges to exit node
-  for (unsigned i=0; i < OutTy->getNumElements(); i++) {
-    DEBUG(errs() << "Output Edge " << i << "\n");
-    // Find the incoming edge at the requested input port
-    DFEdge* E = C->getInDFEdgeAt(i);
-
-    assert(E && "No Binding for output element!");
-    // Find the Source DFNode associated with the incoming edge
-    DFNode* SrcDF = E->getSourceDF();
-
-    DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
-
-    // If Source DFNode is a dummyNode, edge is from parent. Get the
-    // argument from argument list of this internal node
-    Value* inputVal;
-    if(SrcDF->isEntryNode()) {
-      inputVal = getArgumentAt(F_X86, i);
-      DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-    }
-    else {
-      // edge is from a internal node
-      // Check - code should already be generated for this source dfnode
-      // FIXME: Since the 2-level kernel code gen has aspecific structure, we
-      // can assume the SrcDF is same as Kernel Leaf node.
-      // Use outArgMap to get correct mapping
-      SrcDF = K->KernelLeafNode;
-      assert(OutputMap.count(SrcDF)
-             && "Source node call not found. Dependency violation!");
-
-      // Find Output Value associated with the Source DFNode using OutputMap
-      Value* CI = OutputMap[SrcDF];
-
-      // Extract element at source position from this call instruction
-      std::vector<unsigned> IndexList;
-      // i is the destination of DFEdge E
-      // Use the mapping instead of the bindings
-//      IndexList.push_back(E->getSourcePosition());
-      IndexList.push_back(outArgMap[i]);
-      DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-      ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                             "",RI);
-      inputVal = EI;
-    }
-    std::vector<unsigned> IdxList;
-    IdxList.push_back(i);
-    retVal = InsertValueInst::Create(retVal, inputVal, IdxList, "", RI);
-  }
-
-  DEBUG(errs() << "Extracted all\n");
-  switchToTimer(visc_TimerID_NONE, RI);
-  retVal->setName("output");
-  ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
-  ReplaceInstWithInst(RI, newRI);
-}
-
-
-// Right now, only targeting the one level case. In general, device functions
-// can return values so we don't need to change them
-void CGT_SPIR::codeGen(DFInternalNode* N) {
-  errs () << "Inside node: " << N->getFuncPointer()->getName() << "\n";
-  if(KernelLaunchNode == NULL)
-    errs () << "No kernel launch node\n";
-  else {
-    errs () << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n";
-  }
-
-
-  if (!KernelLaunchNode) {
-    DEBUG(errs() << "No code generated (host code for kernel launch complete).\n");
-    return;
-  }
-
-  if (N == KernelLaunchNode) {
-    DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
-    //TODO
-
-    // Now the remaining nodes to be visited should be ignored
-    KernelLaunchNode = NULL;
-    DEBUG(errs() << "Insert Runtime calls\n");
-    insertRuntimeCalls(N, kernel, getSPIRFilename(M));
-
-  } else {
-    DEBUG(errs() << "Found intermediate node. Getting size parameters.\n");
-    // Keep track of the arguments order.
-    std::map<unsigned, unsigned> inmap1 = N->getInArgMap();
-    std::map<unsigned, unsigned> inmap2 = kernel->getInArgMap();
-    // TODO: Structure assumed: one thread node, one allocation node (at most),
-    // TB node
-    std::map<unsigned, unsigned> inmapFinal;
-    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end();
-         ib != ie; ++ib) {
-      inmapFinal[ib->first] = inmap1[ib->second];
-    }
-
-    kernel->setInArgMap(inmapFinal);
-
-    // Keep track of the output arguments order.
-    std::vector<unsigned> outmap1 = N->getOutArgMap();
-    std::vector<unsigned> outmap2 = kernel->getOutArgMap();
-
-    // TODO: Change when we have incoming edges to the dummy exit node from more
-    // than one nodes. In this case, the number of bindings is the same, but
-    // their destination position, thus the index in outmap1, is not
-    // 0 ... outmap2.size()-1
-    // The limit is the size of outmap2, because this is the number of kernel
-    // output arguments for which the mapping matters
-    // For now, it reasonable to assume that all the kernel arguments are returned,
-    // maybe plys some others from other nodes, thus outmap2.size() <= outmap1.size()
-    for (unsigned i = 0; i < outmap2.size(); i++) {
-      outmap1[i] = outmap2[outmap1[i]];
-    }
-    kernel->setOutArgMap(outmap1);
-
-    // Track the source of local dimlimits for the kernel
-    // Dimension limit can either be a constant or an argument of parent
-    // function. Since Internal node would no longer exist, we need to insert the
-    // localWGSize with values from the parent of N.
-    std::vector<Value*> localWGSizeMapped;
-    for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
-      if (isa<Constant>(kernel->localWGSize[i])) {
-        // if constant, use as it is
-        localWGSizeMapped.push_back(kernel->localWGSize[i]);
-      }
-      else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
-        // if argument, find the argument location in N. Use InArgMap of N to
-        // find the source location in Parent of N. Retrieve the argument from
-        // parent to insert in the vector.
-        unsigned argNum = Arg->getArgNo();
-        // This argument will be coming from the parent node, not the allocation
-        // Node
-        assert(N->getInArgMap().find(argNum) != N->getInArgMap().end());
-
-        unsigned parentArgNum = N->getInArgMap()[argNum];
-        Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
-        localWGSizeMapped.push_back(A);
-      }
-      else {
-        assert(false && "LocalWGsize using value which is neither argument nor constant!");
-      }
-    }
-    // Update localWGSize vector of kernel
-    kernel->setLocalWGSize(localWGSizeMapped);
-  }
-
-}
-
-//static bool checkPreferredTarget(DFNode* N, visc::Target T) {
-  //Function* F = N->getFuncPointer();
-  //Module* M = F->getParent();
-  //NamedMDNode* HintNode;
-  //switch (T) {
-    //case visc::GPU_TARGET:
-      //HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
-      //break;
-    //case visc::SPIR_TARGET:
-      //HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
-      //break;
-    //case visc::CPU_TARGET:
-      //HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
-      //break;
-    //default:
-      //llvm_unreachable("Target Not supported yet!");
-  //}
-  //for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
-    //MDNode* MetaNode = HintNode->getOperand(i);
-    //if(F == MetaNode->getOperand(0))
-      //return true;
-  //}
-  //return false;
-//}
-
-void CGT_SPIR::codeGen(DFLeafNode* N) {
-
-  // Skip code generation if it is a dummy node
-  if(N->isDummyNode()) {
-    DEBUG(errs() << "Skipping dummy node\n");
-    return;
-  }
-
-  // Skip code generation if it is an allocation node
-  if(N->isAllocationNode()) {
-    DEBUG(errs() << "Skipping allocation node\n");
-    return;
-  }
-
-  // Generate code only if it has the right hint
-//  if(!checkPreferredTarget(N, visc::SPIR_TARGET)) {
-//    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
-//    return;
-//  }
-  if(!preferredTargetIncludes(N, visc::SPIR_TARGET)) {
-    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
-    return;
-  }
-
-  // Checking which node is the kernel launch
-  DFNode* PNode = N->getParent();
-  int pLevel = PNode->getLevel();
-  int pReplFactor = PNode->getNumOfDim();
-
-  // Choose parent node as kernel launch if:
-  // (1) Parent is the top level node i.e., Root of DFG
-  //                    OR
-  // (2) Parent does not have multiple instances
-  errs() << "pLevel = " << pLevel << "\n";
-  errs() << "pReplFactor = " << pReplFactor << "\n";
-
-  if (!pLevel || !pReplFactor) {
-    errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n";
-    KernelLaunchNode = PNode;
-    errs() << "Setting Kernel Launch Node\n";
-    kernel = new Kernel(NULL,
-                        N,
-                        N->getInArgMap(),
-                        N->getSharedInArgMap(),
-                        N->getOutArgMap(),
-                        N->getNumOfDim(),
-                        N->getDimLimits());
-  }
-  else {
-    // Converting a 2-level DFG to opencl kernel
-    errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n";
-    KernelLaunchNode = PNode->getParent();
-    assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match");
-    // Contains the instructions generating the kernel configuration parameters
-    kernel = new Kernel(NULL,                 // kernel function
-                        N,                    // kernel leaf node
-                        N->getInArgMap(),     // kenel argument mapping
-                        N->getSharedInArgMap(),
-                        N->getOutArgMap(),    // kernel output mapping from the leaf to the interemediate node
-                        PNode->getNumOfDim(), // gridDim
-                        PNode->getDimLimits(),// grid size
-                        N->getNumOfDim(),     // blockDim
-                        N->getDimLimits());   // block size
-
-  }
-
-  std::vector<IntrinsicInst *> IItoRemove;
-  BuildDFG::HandleToDFNode Leaf_HandleToDFNodeMap;
-
-  // Get the function associated with the dataflow node
-  Function *F = N->getFuncPointer();
-
-  // Look up if we have visited this function before. If we have, then just
-  // get the cloned function pointer from DFNode. Otherwise, create the cloned
-  // function and add it to the DFNode GenFunc.
-  Function *F_spir = N->getGenFuncForTarget(visc::SPIR_TARGET);
-  assert(F_spir == NULL && "Error: Visiting a node for which code already generated");
-
-  // Clone the function
-  ValueToValueMapTy VMap;
-
-  Twine FName = F->getName();
-  F_spir = CloneFunction(F, VMap);
-  F_spir->setName(FName+"_spir");
-  errs() << "Old Function Name: " << F->getName() << "\n";
-  errs() << "New Function Name: " << F_spir->getName() << "\n";
-
-  F_spir->removeFromParent();
-
-  // Insert the cloned function into the kernels module
-  KernelM->getFunctionList().push_back(F_spir);
-
-  //TODO: Iterate over all the instructions of F_spir and identify the
-  //callees and clone them into this module.
-  DEBUG(errs() << *F_spir->getType());
-  DEBUG(errs() << *F_spir);
-
-  //Add generated function info to DFNode
-  //N->setGenFunc(F_spir, visc::SPIR_TARGET);
-
-  F_spir = transformFunctionToVoid(F_spir);
-
-  // Add generated function info to DFNode
-  //N->setGenFunc(F_spir, visc::SPIR_TARGET);
-
-  removeInOutAttributes(F_spir);
-
-  //Add generated function info to DFNode
-  N->addGenFunc(F_spir, visc::SPIR_TARGET, false);
-
-  DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n");
-  F_spir->removeAttributes(AttributeSet::FunctionIndex, F_spir->getAttributes().getFnAttributes());
-  F_spir->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
-
-
-  //FIXME: For now, assume only one allocation node
-  kernel->AllocationNode = NULL;
-
-  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end();
-       ieb != iee; ++ieb) {
-    DFNode *SrcDFNode = (*ieb)->getSourceDF();
-    DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
-    DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n");
-    DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n");
-    if (!SrcDFNode->isDummyNode()) {
-      assert(SrcDFNode->isAllocationNode());
-      kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode);
-      kernel->allocInArgMap = SrcDFNode->getInArgMap();
-      break;
-    }
-  }
-
-  // Vector for shared memory arguments
-  std::vector<unsigned> SharedMemArgs;
-
-  // If no allocation node was found, SharedMemArgs is empty
-  if (kernel->AllocationNode) {
-
-    ValueToValueMapTy VMap;
-    Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
-    //F_alloc->removeFromParent();
-    // Insert the cloned function into the kernels module
-    //M.getFunctionList().push_back(F_alloc);
-
-    std::vector<IntrinsicInst *> ViscMallocInstVec;
-    findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec);
-
-    for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) {
-      IntrinsicInst *II = ViscMallocInstVec[i];
-      assert(II->hasOneUse() && "visc_malloc result is used more than once");
-      II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
-      II->eraseFromParent();
-    }
-    kernel->AllocationFunction = F_alloc;
-
-    // This could be used to check that the allocation node has the appropriate
-    // number of fields in its return struct
-/*
-    ReturnInst *RI = ReturnInstVec[0];
-    Value *RetVal = RI->getReturnValue();
-    Type *RetTy = RetVal->getType();
-    StructType *RetStructTy = dyn_cast<StructType>(RetTy);
-    assert(RetStructTy && "Allocation node does not return a struct type");
-    unsigned numFields = RetStructTy->getNumElements();
-*/
-    std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap();
-    AllocationNodeProperty* APN =
-      (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation);
-    for (auto& AllocPair: APN->getAllocationList()) {
-      unsigned destPos = AllocPair.first->getDestPosition();
-      unsigned srcPos = AllocPair.first->getSourcePosition();
-      SharedMemArgs.push_back(destPos);
-      sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
-      sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
-    }
-    kernel->setSharedInArgMap(sharedInMap);
-  }
-  std::sort(SharedMemArgs.begin(), SharedMemArgs.end());
-
-  // All pointer args which are not shared memory pointers have to be moved to
-  // global address space
-  unsigned argIndex = 0;
-  std::vector<unsigned> GlobalMemArgs;
-  for(auto& Arg: F_spir->getArgumentList()) {
-    if (Arg.getType()->isPointerTy()) {
-      // If the arguement is already chosen for shared memory arguemnt list, skip.
-      // Else put it in Global memory arguement list
-      if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) {
-        GlobalMemArgs.push_back(argIndex);
-      }
-    }
-    argIndex++;
-  }
-  std::sort(GlobalMemArgs.begin(), GlobalMemArgs.end());
-
-  /* At this point, we assume that chescks for the fact that SharedMemArgs only
-     contains pointer arguments to GLOBAL_ADDRSPACE have been performed by the
-     analysis pass */
-
-  F_spir = changeArgAddrspace(F_spir, SharedMemArgs, SHARED_ADDRSPACE);
-  removeAttributeAtArguments(F_spir, SharedMemArgs, Attribute::NoCapture);
-  F_spir = changeArgAddrspace(F_spir, GlobalMemArgs, GLOBAL_ADDRSPACE);
-
-
-  // Go through all the instructions
-  for (inst_iterator i = inst_begin(F_spir), e = inst_end(F_spir); i != e; ++i) {
-    Instruction *I = &(*i);
-    // Leaf nodes should not contain VISC graph intrinsics or launch
-    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
-    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
-
-    if (BuildDFG::isViscIntrinsic(I)) {
-      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-      IntrinsicInst* ArgII;
-      DFNode* ArgDFNode;
-
-      /************************ Handle VISC Query intrinsics ************************/
-
-      switch (II->getIntrinsicID()) {
-      /**************************** llvm.visc.getNode() *****************************/
-      case Intrinsic::visc_getNode: {
-        DEBUG(errs() << F_spir->getName() << "\t: Handling getNode\n");
-        // add mapping <intrinsic, this node> to the node-specific map
-        Leaf_HandleToDFNodeMap[II] = N;
-        IItoRemove.push_back(II);
-      }
-      break;
-      /************************* llvm.visc.getParentNode() **************************/
-      case Intrinsic::visc_getParentNode: {
-        DEBUG(errs() << F_spir->getName() << "\t: Handling getParentNode\n");
-        // get the parent node of the arg node
-        // get argument node
-        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
-        // get the parent node of the arg node
-        // Add mapping <intrinsic, parent node> to the node-specific map
-        // the argument node must have been added to the map, orelse the
-        // code could not refer to it
-        Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
-
-        IItoRemove.push_back(II);
-      }
-      break;
-      /*************************** llvm.visc.getNumDims() ***************************/
-      case Intrinsic::visc_getNumDims: {
-        DEBUG(errs() << F_spir->getName() << "\t: Handling getNumDims\n");
-        // get node from map
-        // get the appropriate field
-        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
-        int numOfDim = ArgDFNode->getNumOfDim();
-        DEBUG(errs() << "\t  Got node dimension : " << numOfDim << "\n");
-        IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext());
-        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
-
-        // Replace the result of the intrinsic with the computed value
-        II->replaceAllUsesWith(numOfDimConstant);
-
-        IItoRemove.push_back(II);
-      }
-      break;
-      /*********************** llvm.visc.getNodeInstanceID() ************************/
-      case Intrinsic::visc_getNodeInstanceID_x:
-      case Intrinsic::visc_getNodeInstanceID_y:
-      case Intrinsic::visc_getNodeInstanceID_z: {
-        DEBUG(errs() << F_spir->getName() << "\t: Handling getNodeInstanceID\n");
-        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
-        assert(ArgDFNode && "Arg node is NULL");
-        // A leaf node always has a parent
-        DFNode* ParentDFNode = ArgDFNode->getParent();
-        assert(ParentDFNode && "Parent node of a leaf is NULL");
-
-        // Get the number associated with the required dimension
-        // FIXME: The order is important!
-        // These three intrinsics need to be consecutive x,y,z
-        uint64_t dim = II->getIntrinsicID() -
-                       Intrinsic::visc_getNodeInstanceID_x;
-        assert((dim >= 0) && (dim < 3) && "Invalid dimension argument");
-        DEBUG(errs() << "\t  dimension = " << dim << "\n");
-
-        // Argument of the function to be called
-        ConstantInt * DimConstant =
-          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
-        //ArrayRef<Value *> Args(DimConstant);
-
-        // The following is to find which function to call
-        Function * OpenCLFunction;
-        int parentLevel = N->getParent()->getLevel();
-        int parentReplFactor = N->getParent()->getNumOfDim();
-        DEBUG(errs() << "Parent Level = " << parentLevel << "\n");
-        DEBUG(errs() << "Parent Repl factor = " << parentReplFactor << "\n");
-
-        FunctionType* FT =
-            FunctionType::get(Type::getInt64Ty(KernelM->getContext()),           
-                              ArrayRef<Type*>(Type::getInt32Ty(KernelM->getContext())),
-                              false);
-
-        if ((!parentLevel || !parentReplFactor) && ArgDFNode == N) {
-          // We only have one level in the hierarchy or the parent node is not
-          // replicated. This indicates that the parent node is the kernel
-          // launch, so we need to specify a global id.
-          // We can translate this only if the argument is the current node
-          // itself
-          DEBUG(errs() << "Substitute with get_global_id()\n");
-          DEBUG(errs() << *II << "\n");
-          OpenCLFunction = cast<Function>
-                           (KernelM->getOrInsertFunction(getMangledName("get_global_id"), FT));
-        } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
-          // We are asking for this node's id with respect to its parent
-          // this is a local id call
-          OpenCLFunction = cast<Function>
-                           (KernelM->getOrInsertFunction(getMangledName("get_local_id"), FT));
-        } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
-          // We are asking for this node's parent's id with respect to its
-          // parent: this is a group id call
-          OpenCLFunction = cast<Function>
-                           (KernelM->getOrInsertFunction(getMangledName("get_group_id"), FT));
-        } else {
-          errs() << N->getFuncPointer()->getName() << "\n";
-          errs() << N->getParent()->getFuncPointer()->getName() << "\n";
-          errs() << *II << "\n";
-
-          assert(false && "Unable to translate getNodeInstanceID intrinsic");
-        }
-
-        // Create call instruction, insert it before the intrinsic and truncate
-        // the output to 32 bits and replace all the uses of the previous
-        // instruction with the new one
-        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
-        II->replaceAllUsesWith(CI);
-
-        IItoRemove.push_back(II);
-      }
-      break;
-      /********************** llvm.visc.getNumNodeInstances() ***********************/
-      case Intrinsic::visc_getNumNodeInstances_x:
-      case Intrinsic::visc_getNumNodeInstances_y:
-      case Intrinsic::visc_getNumNodeInstances_z: {
-//TODO: think about whether this is the best way to go
-// there are hw specific registers. therefore it is good to have the intrinsic
-// but then, why do we need to keep that info in the graph?
-// (only for the kernel configuration during the call)
-
-        DEBUG(errs() << F_spir->getName() << "\t: Handling getNumNodeInstances\n");
-        ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
-        ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
-        // A leaf node always has a parent
-        DFNode* ParentDFNode = ArgDFNode->getParent();
-        assert(ParentDFNode && "Parent node of a leaf is NULL");
-
-        // Get the number associated with the required dimension
-        // FIXME: The order is important!
-        // These three intrinsics need to be consecutive x,y,z
-        uint64_t dim = II->getIntrinsicID() -
-                       Intrinsic::visc_getNumNodeInstances_x;
-        assert((dim >= 0) && (dim < 3) && "Invalid dimension argument");
-        DEBUG(errs() << "\t  dimension = " << dim << "\n");
-
-        // Argument of the function to be called
-        ConstantInt * DimConstant =
-          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
-        //ArrayRef<Value *> Args(DimConstant);
-
-        // The following is to find which function to call
-        Function * OpenCLFunction;
-        int parentLevel = ParentDFNode->getLevel();
-        int parentReplFactor = ParentDFNode->getNumOfDim();
-
-        FunctionType* FT =
-            FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
-                              Type::getInt32Ty(KernelM->getContext()),
-                              false);
-        if ((N == ArgDFNode) && (!parentLevel || !parentReplFactor)) {
-          // We only have one level in the hierarchy or the parent node is not
-          // replicated. This indicates that the parent node is the kernel
-          // launch, so the instances are global_size (gridDim x blockDim)
-          OpenCLFunction = cast<Function>
-                           (KernelM->getOrInsertFunction(getMangledName("get_global_size"), FT));
-        } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
-          // We are asking for this node's instances
-          // this is a local size (block dim) call
-          OpenCLFunction = cast<Function>
-                           (KernelM->getOrInsertFunction(getMangledName("get_local_size"), FT));
-        } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
-          // We are asking for this node's parent's instances
-          // this is a (global_size/local_size) (grid dim) call
-          OpenCLFunction = cast<Function>
-                           (KernelM->getOrInsertFunction(getMangledName("get_num_groups"), FT));
-        } else {
-          assert(false && "Unable to translate getNumNodeInstances intrinsic");
-        }
-
-        // Create call instruction, insert it before the intrinsic and truncate
-        // the output to 32 bits and replace all the uses of the previous
-        // instruction with the new one
-        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
-        II->replaceAllUsesWith(CI);
-
-        IItoRemove.push_back(II);
-      }
-      break;
-      case Intrinsic::visc_barrier:
-      {
-        DEBUG(errs() << F_spir->getName() << "\t: Handling barrier\n");
-        DEBUG(errs() << "Substitute with barrier()\n");
-        DEBUG(errs() << *II << "\n");
-        FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()),
-                              std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())),
-                              false);
-        Function* OpenCLFunction = cast<Function>
-                           (KernelM->getOrInsertFunction(getMangledName("barrier"), FT));
-        CallInst* CI = CallInst::Create(OpenCLFunction,
-                               ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)),
-                               "", II);
-        II->replaceAllUsesWith(CI);
-        IItoRemove.push_back(II); 
-      }
-      break;
-      case Intrinsic::visc_atomic_cmpxchg:
-      case Intrinsic::visc_atomic_add:
-      case Intrinsic::visc_atomic_sub:
-      case Intrinsic::visc_atomic_xchg:
-      case Intrinsic::visc_atomic_min:
-      case Intrinsic::visc_atomic_umin:
-      case Intrinsic::visc_atomic_max:
-      case Intrinsic::visc_atomic_umax:
-      case Intrinsic::visc_atomic_and:
-      case Intrinsic::visc_atomic_or:
-      case Intrinsic::visc_atomic_xor:
-      case Intrinsic::visc_atomic_inc:
-      case Intrinsic::visc_atomic_dec:
-      {
-        DEBUG(errs() << *II << "\n");
-        // Only have support for i32 atomic intrinsics
-        assert(II->getType() == Type::getInt32Ty(II->getContext())
-            && "Only support i32 atomic intrinsics for now");
-        // Substitute with appropriate atomic builtin
-        assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics");
-
-        Value* Ptr = II->getArgOperand(0);
-        Value* Val = II->getArgOperand(1);
-        assert(Ptr->getType()->isPointerTy()
-            && "First argument of supported atomics is expected to be a pointer");
-        PointerType* PtrTy = cast<PointerType>(Ptr->getType());
-        if(PtrTy != Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace())) {
-          Ptr = CastInst::CreatePointerCast(Ptr,
-                                            Type::getInt32PtrTy(II->getContext(),
-                                            PtrTy->getAddressSpace()), "", II);
-        }
-
-        StringRef name = getAtomicOpName(II->getIntrinsicID(), PtrTy->getAddressSpace());
-
-        Type* paramTypes[] = { Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace()),
-                               Type::getInt32Ty(KernelM->getContext())
-                             };
-        FunctionType* AtomicFT = FunctionType::get(II->getType(),
-                                 ArrayRef<Type*>(paramTypes, 2),
-                                 false);
-        Function* AtomicFunction = cast<Function>
-                                   (KernelM->getOrInsertFunction(name, AtomicFT));
-        Value* atomicArgs[] = { Ptr, Val };
-        CallInst* AtomicInst = CallInst::Create(AtomicFunction,
-                                               ArrayRef<Value*>(atomicArgs, 2),
-                                               "", II);
-
-        DEBUG(errs() << "Substitute with: " << *AtomicInst << "\n");
-        II->replaceAllUsesWith(AtomicInst);
-        IItoRemove.push_back(II);
-      }
-      break;
-      default:
-        assert(false && "Unknown VISC Intrinsic!");
-        break;
-      }
-
-    }
-    else if(CallInst* CI = dyn_cast<CallInst>(I)) {
-      DEBUG(errs() << "Found a call: " << *CI << "\n");
-      Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
-      if(calleeF->isDeclaration()) {
-        // Add the declaration to kernel module
-        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
-        KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
-        if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(CI)) {
-          // Now handle a few specific intrinsics
-          // For now, sin and cos are translated to their libclc equivalent
-          switch(II->getIntrinsicID()) {
-          case Intrinsic::sin:
-          case Intrinsic::cos:
-          case Intrinsic::sqrt:
-          case Intrinsic::floor:
-          case Intrinsic::nvvm_rsqrt_approx_f:
-          {
-            DEBUG(errs() << "Found math function: " << *II << "\n");
-            // Get the builtin function
-            // SPIR uses mangled name for builtin math functions
-            assert(II->getType()->isFloatTy()
-                   && "Only handling sin(float) and cos(float)!");
-            std::string name = getMathFunctionName(II->getIntrinsicID());
-
-            FunctionType* MathFT = FunctionType::get(II->getType(),
-                                   Type::getFloatTy(KernelM->getContext()),
-                                   false);
-            Function* MathFunction = cast<Function>
-                                     (KernelM->getOrInsertFunction(name, MathFT));
-            CallInst* CI = CallInst::Create(MathFunction, II->getArgOperand(0), II->getName(), II);
-
-            II->replaceAllUsesWith(CI);
-            IItoRemove.push_back(II);
-            break;
-          }
-          default:
-            DEBUG(errs() << "[WARNING] Found Intrinsic: " << *II << "\n" );
-          }
-        }
-
-      }
-      else {
-        // Clone the function
-        ValueToValueMapTy VMap;
-        Function* newCalleeF = CloneFunction(calleeF, VMap);
-        newCalleeF->removeFromParent(); //TODO: MARIA check
-        KernelM->getFunctionList().push_back(newCalleeF);
-      }
-      //TODO: how to handle address space qualifiers in load/store
-    }
-
-  }
-
-  // We need to do this explicitly: DCE pass will not remove them because we
-  // have assumed theworst memory behaviour for these function calls
-  // Traverse the vector backwards, otherwise definitions are deleted while
-  // their subsequent uses are still around
-  for (std::vector<IntrinsicInst *>::reverse_iterator ri = IItoRemove.rbegin(),
-       re = IItoRemove.rend(); ri != re; ++ri)
-    (*ri)->eraseFromParent();
-
-  addCLMetadata(F_spir);
-  kernel->KernelFunction = F_spir;
-  errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n";
-  DEBUG(errs() << *KernelM);
-
-  return;
-}
-
-bool DFG2LLVM_SPIR::runOnModule(Module &M) {
-  errs() << "\nDFG2LLVM_SPIR PASS\n";
-
-  // Get the BuildDFG Analysis Results:
-  // - Dataflow graph
-  // - Maps from i8* hansles to DFNode and DFEdge
-  BuildDFG &DFG = getAnalysis<BuildDFG>();
-
-  // DFInternalNode *Root = DFG.getRoot();
-  std::vector<DFInternalNode*> Roots = DFG.getRoots();
-  //    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
-  //    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
-
-  // Visitor for Code Generation Graph Traversal
-  CGT_SPIR *CGTVisitor = new CGT_SPIR(M, DFG);
-
-  // Iterate over all the DFGs and produce code for each one of them
-  for (auto rootNode: Roots) {
-    // Initiate code generation for root DFNode
-    CGTVisitor->visit(rootNode);
-  }
-
-  // This is not required. Itrinsics that do not have a use are not a problem
-  //CGTVisitor->removeLLVMIntrinsics();
-  CGTVisitor->writeKernelsModule();
-
-  //TODO: Edit module epilogue to remove the VISC intrinsic declarations
-  delete CGTVisitor;
-
-  return true;
-}
-
-std::string CGT_SPIR::getKernelsModuleName(Module &M) {
-  /*SmallString<128> currentDir;
-  llvm::sys::fs::current_path(currentDir);
-  std::string fileName = getFilenameFromModule(M);
-  Twine output = Twine(currentDir) + "/Output/" + fileName + "";
-  return output.str().append(".kernels.ll");*/
-  std::string mid = M.getModuleIdentifier();
-  return mid.append(".kernels.ll");
-}
-
-void CGT_SPIR::fixValueAddrspace(Value* V, unsigned addrspace) {
-  assert(isa<PointerType>(V->getType())
-         && "Value should be of Pointer Type!");
-  PointerType* OldTy = cast<PointerType>(V->getType());
-  PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
-  V->mutateType(NewTy);
-  for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
-    // Change all uses producing pointer type in same address space to new
-    // addressspace.
-    if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
-      if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
-        fixValueAddrspace(*ui, addrspace);
-      }
-    }
-  }
-}
-
-Function* CGT_SPIR::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
-  unsigned idx = 0;
-  std::vector<Type*> ArgTypes;
-  for(auto& arg: F->getArgumentList()) {
-    DEBUG(errs() << arg << "\n");
-    unsigned argno = arg.getArgNo();
-    if ((idx < Args.size()) && (argno == Args[idx])) {
-      fixValueAddrspace(&arg, addrspace);
-      idx++;
-    }
-    ArgTypes.push_back(arg.getType());
-  }
-  FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
-
-  //F->mutateType(PTy);
-  Function* newF = cloneFunction(F, newFT, false);
-  replaceNodeFunctionInIR(*F->getParent(), F, newF);
-
-  DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
-  return newF;
-}
-
-/* Remove the specified argument from arguments at positions denoted in Args */
-void CGT_SPIR::removeAttributeAtArguments(Function* F, std::vector<unsigned> &Args, Attribute::AttrKind attrKind) {
-  DEBUG(errs() << "Removing nocapture attribute from shared memory arguments of function " << F->getName() << "\n");
-
-  unsigned cnt = 0, arg_no = 0;
-  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae && arg_no < Args.size(); ++ai, ++cnt) {
-
-    if (Args[arg_no] == cnt) {
-      AttributeSet AS = F->getAttributes();
-      AttrBuilder AB(AS, ai->getArgNo()+1);
-      AB.removeAttribute(attrKind);
-      AttributeSet argAS = AttributeSet::get(F->getContext(), ai->getArgNo()+1, AB);
-      F->removeAttributes(1+ai->getArgNo(), AS.getParamAttributes(ai->getArgNo() + 1));
-      F->addAttributes(1+ai->getArgNo(), argAS);
-
-      arg_no++;
-    }
-  }
-}
-
-/* Add metadata to module KernelM, for OpenCL kernels */
-void CGT_SPIR::addCLMetadata(Function *F) {
-  // TODO: There is additional metadata used by kernel files but we skip them as
-  // they are not mandatory. In future they might be useful to enable
-  // optimizations
-
-  IRBuilder<> Builder(&*F->begin());
- 
-  // Create node for "kernel_arg_type"
-  SmallVector<Metadata*,8> argTypeNames;
-  argTypeNames.push_back(MDString::get(KernelM->getContext(), "kernel_arg_type"));
-
-  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
-      ai++) {
-    argTypeNames.push_back(MDString::get(KernelM->getContext(), printType(ai->getType())));
-  }
-  // All argument type names are in the vector. Create a metadata node
-  // "kernel_arg_type"
-  MDTuple* KernelArgTypes = MDNode::get(KernelM->getContext(), argTypeNames);
-  
-  // Create kernel metadata node containg the kernel function and the
-  // "kernel_arg_type" metadata node created above
-  SmallVector<Metadata*,8> KernelMD;
-  KernelMD.push_back(ValueAsMetadata::get(F));
-  KernelMD.push_back(KernelArgTypes);
-  MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
-
-  // Create metadata node opencl.kernels. It points to the kernel metadata node
-  NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
-  MDN_kernels->addOperand(MDKernelNode);
-
-  //KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
-  // TODO: Replace 1 with the number of the kernel.
-  // Add when support for multiple launces is added
-  //KernelMD.push_back(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1));
-  //MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
-  //NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
-  //MDN_annotations->addOperand(MDNvvmAnnotationsNode);
-
-}
-
-/* Function to remove all remaining declarations of llvm intrinsics,
- * as they are not supported in SPIR.
- */
-void CGT_SPIR::removeLLVMIntrinsics() {
-
-  std::vector<Function*> fv = std::vector<Function*>();
-
-  for (Module::iterator mi = KernelM->begin(), me = KernelM->end(); (mi != me); ++mi) {
-    Function* F = &*mi;
-    if (F->isDeclaration() && F->getName().startswith("llvm.")) {
-      DEBUG(errs() << "Declaration: " << F->getName() << " with " << F->getNumUses() <<"uses.\n");
-      assert(F->hasNUses(0) && "LLVM intrinsic function still in use");
-      fv.push_back(F);
-    }
-  }
-
-  for (std::vector<Function*>::iterator vi = fv.begin(); vi != fv.end(); ++vi) {
-    DEBUG(errs() << "Erasing declaration: " << (*vi)->getName() <<"\n");
-    (*vi)->replaceAllUsesWith(UndefValue::get((*vi)->getType()));
-    (*vi)->eraseFromParent();
-  }
-
-}
-
-void CGT_SPIR::writeKernelsModule() {
-
-  // In addition to deleteing all otjer functions, we also want to spice it up a
-  // little bit. Do this now.
-  legacy::PassManager Passes;
-
-  std::error_code EC;
-  tool_output_file Out(getKernelsModuleName(M).c_str(), EC, sys::fs::F_None);
-  if (EC) {
-    errs() << EC.message() << "\n";
-  }
-
-  Passes.add(
-      createPrintModulePass(Out.os()));
-
-  Passes.run(*KernelM);
-
-  // Declare success.
-  Out.keep();
-}
-
-Function* CGT_SPIR::transformFunctionToVoid(Function* F) {
-
-  // FIXME: Maybe do that using the Node?
-  StructType* FRetTy = cast<StructType>(F->getReturnType());
-  assert(FRetTy && "Return Type must always be a struct");
-
-  // Keeps return statements, because we will need to replace them
-  std::vector<ReturnInst *> RItoRemove;
-  findReturnInst(F, RItoRemove);
-
-
-  // Check for { } return struct, which means that the function returns void
-  if (FRetTy->isEmptyTy()) {
-
-    DEBUG(errs() << "\tFunction output struct is void\n");
-    DEBUG(errs() << "\tNo parameters added\n");
-
-    // Replacing return statements with others returning void
-    for (std::vector<ReturnInst *>::iterator i = RItoRemove.begin(),
-         e = RItoRemove.end(); i != e; ++i) {
-      ReturnInst::Create((F->getContext()), 0, (*i));
-      (*i)->eraseFromParent();
-    }
-    DEBUG(errs() << "\tChanged return statements to return void\n");
-  }
-  else {
-    // The struct has return values, thus needs to be converted to parameter
-
-    // Iterate over all element types of return struct and add arguments to the
-    // function
-    std::vector<Argument*> Args;
-    for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
-      Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
-      Args.push_back(RetArg);
-      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
-    }
-
-    Function::arg_iterator ai, ae;
-
-    DEBUG(errs() << "\tReplacing Return statements\n");
-    // Replace return statements with extractValue and store instructions
-    for (std::vector<ReturnInst *>::iterator rii = RItoRemove.begin(),
-         rie = RItoRemove.end(); rii != rie; ++rii) {
-      ReturnInst* RI = (*rii);
-      Value* RetVal = RI->getReturnValue();
-      for(unsigned i = 0; i < Args.size(); i++) {
-        ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
-                                    Args[i]->getName()+".val", RI);
-        new StoreInst(EI, Args[i], RI);
-      }
-      // assert(RetVal && "Return value should not be null at this point");
-      // StructType* RetType = cast<StructType>(RetVal->getType());
-      // assert(RetType && "Return type is not a struct");
-
-      ReturnInst::Create((F->getContext()), 0, RI);
-      RI->eraseFromParent();
-
-    }
-  }
-  DEBUG(errs() << "\tReplaced return statements\n");
-
-  // Create the argument type list with the added argument's type
-  std::vector<Type*> ArgTypes;
-  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
-    ArgTypes.push_back(ai->getType());
-  }
-
-  // Adding new arguments to the function argument list, would not change the
-  // function type. We need to change the type of this function to reflect the
-  // added arguments
-  Type* VoidRetType = Type::getVoidTy(F->getContext());
-  FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
-
-  // Change the function type
-  //F->mutateType(PTy);
-  Function* newF = cloneFunction(F, newFT, false);
-  replaceNodeFunctionInIR(*F->getParent(), F, newF);
-
-  return newF;
-}
-
-// Remove the visc in/out attributes from kernel function
-void CGT_SPIR::removeInOutAttributes(Function* F) {
-  DEBUG(errs() << "Removing visc attributes from argument list of function " << F->getName() << "\n");
-  for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ai++) {
-
-    AttributeSet AS = F->getAttributes();
-    AttrBuilder AB(AS, ai->getArgNo()+1);
-    AB.removeAttribute(Attribute::In);
-    AB.removeAttribute(Attribute::Out);
-    AB.removeAttribute(Attribute::InOut);
-    AttributeSet argAS = AttributeSet::get(F->getContext(), ai->getArgNo()+1, AB);
-    F->removeAttributes(1+ai->getArgNo(), AS.getParamAttributes(ai->getArgNo() + 1));
-    F->addAttributes(1+ai->getArgNo(), argAS);
-
-  }
-}
-
-/******************************************************************************
- *                              Helper functions                              *
- ******************************************************************************/
-
-// Calculate execute node parameters which include, number of diemnsions for
-// dynamic instances of the kernel, local and global work group sizes.
-static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value*
-                                 &GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
-
-  // Assign number of dimenstions a constant value
-  workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
-
-  // If local work group size if null
-  if(!kernel->hasLocalWG()) {
-    LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
-  }
-  else {
-    for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
-      if(isa<Argument>(kernel->localWGSize[i]))
-        kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
-    }
-    LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
-  }
-
-  for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
-    if(isa<Argument>(kernel->globalWGSize[i]))
-      kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
-  }
-
-  // For OpenCL, global work group size is the total bumber of instances in each
-  // dimension. So, multiply local and global dim limits.
-  std::vector<Value*> globalWGSizeInsts;
-  if(kernel->hasLocalWG()) {
-    for (unsigned i = 0; i < kernel->gridDim; i++) {
-      BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
-      globalWGSizeInsts.push_back(MulInst);
-    }
-  }
-  else {
-    globalWGSizeInsts = kernel->globalWGSize;
-  }
-  GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
-  DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
-}
-
-// CodeGen for allocating space for Work Group on stack and returning a pointer
-// to its address
-static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
-  Value* WGPtr;
-  // Get int64_t and or ease of use
-  Type* Int64Ty = Type::getInt64Ty(M.getContext());
-
-  // Work Group type is [#dim x i64]
-  Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
-  // Allocate space of Global work group data on stack and get pointer to
-  // first element.
-  AllocaInst* WG = new AllocaInst(WGTy, WGName, IB);
-  WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
-  Value* nextDim = WGPtr;
-  DEBUG(errs() << *WGPtr << "\n");
-
-  // Iterate over the number of dimensions and store the global work group
-  // size in that dimension
-  for(unsigned i=0; i < WGSize.size(); i++) {
-    assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
-
-    if(WGSize[i]->getType() != Int64Ty) {
-      // If number of dimensions are mentioned in any other integer format,
-      // generate code to extend it to i64. We need to use the mapped value in
-      // the new generated function, hence the use of VMap
-      // FIXME: Why are we changing the kernel WGSize vector here?
-      DEBUG(errs() << "Not i64. Zero extend required.\n");
-      DEBUG(errs() << *WGSize[i] << "\n");
-      CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
-      DEBUG(errs() << "Bitcast done.\n");
-      StoreInst* SI = new StoreInst(CI, nextDim, IB);
-      DEBUG(errs() << "Zero extend done.\n");
-      DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
-    } else {
-      // Store the value representing work group size in ith dimension on
-      // stack
-      StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
-
-      DEBUG(errs() << "\t Work group size: " << *SI << "\n");
-    }
-    if(i+1 < WGSize.size()) {
-      // Move to next dimension
-      GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
-                               ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
-                               WG->getName()+"."+Twine(i+1),
-                               IB);
-      DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
-      nextDim = GEP;
-    }
-  }
-  return WGPtr;
-
-}
-
-//Get generated SPIR binary name
-static std::string getSPIRFilename(const Module& M) {
-  std::string mid = M.getModuleIdentifier();
-  return mid.append(".kernels.bc");
-
-}
-
-// Get the name of the input file from module ID
-static std::string getFilenameFromModule(const Module& M) {
-  std::string moduleID = M.getModuleIdentifier();
-  return moduleID.substr(moduleID.find_last_of("/")+1);
-}
-
-// Changes the data layout of the Module to be compiled with SPIR backend
-// TODO: Figure out when to call it, probably after duplicating the modules
-static void changeDataLayout(Module &M) {
-  std::string spir64_layoutStr = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024";
-    
-  M.setDataLayout(StringRef(spir64_layoutStr));
-  return;
-}
-
-static void changeTargetTriple(Module &M) {
-  std::string spir64_TargetTriple = "spir64-unknown-unknown";
-  M.setTargetTriple(StringRef(spir64_TargetTriple));
-}
-
-// Helper function, generate a string representation of a type
-static std::string printType(Type* ty) {
-  std::string type_str;
-  raw_string_ostream rso(type_str);
-  ty->print(rso);
-  return rso.str();
-}
-
-// Helper function to get mangled names of OpenCL built ins
-static StringRef getMangledName(std::string name) {
-  Twine mangledName = "_Z"+Twine(name.size())+name+"j";
-  return StringRef(mangledName.str());
-}
-
-
-// Helper function, populate a vector with all return statements in a function
-static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
-  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
-    Instruction *I = &(*i);
-    ReturnInst* RI = dyn_cast<ReturnInst>(I);
-    if (RI) {
-      ReturnInstVec.push_back(RI);
-    }
-  }
-}
-
-// Helper function, populate a vector with all IntrinsicID intrinsics in a function
-static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
-  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
-    Instruction *I = &(*i);
-    IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-    if (II && II->getIntrinsicID() == IntrinsicID) {
-      IntrinsicInstVec.push_back(II);
-    }
-  }
-}
-
-// Helper function to get mangled names of OpenCL built ins for atomics
-static StringRef getAtomicMangledName(std::string name, unsigned addrspace, bool sign) {
-  Twine mangledName = "_Z" +
-                      Twine(name.size())+name +
-                      "PU3AS"+Twine(addrspace) + "jj";
-//                      ((sign) ? "ii" : "jj");
-  return StringRef(mangledName.str());
-}
-
-// Helper funtion, returns the OpenCL function name corresponding to atomic op
-static StringRef getAtomicOpName(Intrinsic::ID ID, unsigned addrspace) {
-  switch(ID) {
-    case Intrinsic::visc_atomic_cmpxchg:
-      return getAtomicMangledName("atom_cmpxchg", addrspace, true);
-    case Intrinsic::visc_atomic_add:
-      return getAtomicMangledName("atom_add", addrspace, true);
-    case Intrinsic::visc_atomic_sub:
-      return getAtomicMangledName("atom_sub", addrspace, true);
-    case Intrinsic::visc_atomic_min:
-      return getAtomicMangledName("atom_min", addrspace, true);
-    case Intrinsic::visc_atomic_umin:
-      return getAtomicMangledName("atom_min", addrspace, false);
-    case Intrinsic::visc_atomic_max:
-      return getAtomicMangledName("atom_max", addrspace, true);
-    case Intrinsic::visc_atomic_umax:
-      return getAtomicMangledName("atom_max", addrspace, false);
-    case Intrinsic::visc_atomic_inc:
-      return getAtomicMangledName("atom_inc", addrspace, true);
-    case Intrinsic::visc_atomic_dec:
-      return getAtomicMangledName("atom_dec", addrspace, true);
-    case Intrinsic::visc_atomic_xchg:
-      return getAtomicMangledName("atom_xchg", addrspace, true);
-    case Intrinsic::visc_atomic_and:
-      return getAtomicMangledName("atom_and", addrspace, true);
-    case Intrinsic::visc_atomic_or:
-      return getAtomicMangledName("atom_or", addrspace, true);
-    case Intrinsic::visc_atomic_xor:
-      return getAtomicMangledName("atom_xor", addrspace, true);
-    default:
-      llvm_unreachable("Unsupported atomic intrinsic!");
-  };
-}
-
-static std::string getMathFunctionName(Intrinsic::ID ID) {
-  switch(ID) {
-    case Intrinsic::sin: return "_Z3sinf";
-    case Intrinsic::cos: return "_Z3cosf";
-    case Intrinsic::sqrt: return "_Z4sqrtf";
-    case Intrinsic::floor: return "_Z5floorf";
-    case Intrinsic::nvvm_rsqrt_approx_f: return "_Z5rsqrtf";
-    default:
-      llvm_unreachable("Unsupported math function!");
-  };
-}
-
-} // End of namespace
-
-char DFG2LLVM_SPIR::ID = 0;
-static RegisterPass<DFG2LLVM_SPIR> X("dfg2llvm-spir",
-                                      "Dataflow Graph to LLVM for SPIR Pass",
-                                      false /* does not modify the CFG */,
-                                      true /* transformation,   *
-                                            * not just analysis */);
-
diff --git a/hpvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.exports b/hpvm/lib/Transforms/DFG2LLVM_SPIR/DFG2LLVM_SPIR.exports
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/hpvm/lib/Transforms/DFG2LLVM_SPIR/LLVMBuild.txt b/hpvm/lib/Transforms/DFG2LLVM_SPIR/LLVMBuild.txt
deleted file mode 100644
index 72c4de9efdc816ca74d54d96f4f66afd467d1639..0000000000000000000000000000000000000000
--- a/hpvm/lib/Transforms/DFG2LLVM_SPIR/LLVMBuild.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-;===- ./lib/Transforms/DFG2LLVM_SPIR/LLVMBuild.txt -------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = DFG2LLVM_SPIR
-parent = Transforms