diff --git a/.hooks/pre-commit b/.hooks/pre-commit
new file mode 100755
index 0000000000000000000000000000000000000000..4204a26e48e1ccbde141ba50d5311a0d2de38186
--- /dev/null
+++ b/.hooks/pre-commit
@@ -0,0 +1,48 @@
+#!/bin/sh
+#
+# Format staged *.c|*.cc|*.cpp|*.h|*.hpp files with clang-format.
+# Called by "git commit" with no arguments.  The hook should
+# exit with non-zero status after issuing an appropriate message if
+# it wants to stop the commit.
+#
+# Written for POSIX sh (no bash-only [[ ]]), so it behaves the same when sh
+# is dash, busybox ash or bash.
+
+if git rev-parse --verify HEAD >/dev/null 2>&1
+then
+    against=HEAD
+else
+    # Initial commit: diff against an empty tree object
+    against=$(git hash-object -t tree /dev/null)
+fi
+
+# Redirect output to stderr.
+exec 1>&2
+
+# Find all added/modified C/C++ files staged for this commit, one per line.
+diff_source_files=$(git diff --cached --name-only --diff-filter=AM "$against" \
+					-- '*.c' '*.cc' '*.cpp' '*.h' '*.hpp')
+# Only perform clang-format when changed source files exist
+if [ -n "$diff_source_files" ]; then
+	echo "[clang-format] Reformatting the following files: "
+	echo "$diff_source_files"
+	# Split the list on newlines only and disable globbing, so that paths
+	# containing spaces or wildcard characters are passed through intact.
+	IFS=$(printf '\n_'); IFS=${IFS%_}
+	set -f
+	if ! clang-format --style=file -i $diff_source_files; then
+		echo "[clang-format] clang-format failed; commit rejected."
+		exit 1
+	fi
+	echo "[clang-format] Adding reformatted files."
+	# NOTE: this stages the whole working-tree file, including any hunks of
+	# these files that were deliberately left unstaged.
+	git add -- $diff_source_files || exit 1
+	# Commit can become empty after this; reject commit in that case.
+	if git diff --cached --quiet "$against"; then
+		echo "[clang-format] Commit is empty after formatting; rejected."
+		exit 1
+	fi
+else
+	echo "[clang-format] No C/C++ source files modified in the commit."
+fi
diff --git a/hpvm/include/BuildDFG/BuildDFG.h b/hpvm/include/BuildDFG/BuildDFG.h
index d48aa3aa69822bae28a031a438dcd7ecfc9d7748..28230e135beb68c07c998e607fa3d03d40a66791 100644
--- a/hpvm/include/BuildDFG/BuildDFG.h
+++ b/hpvm/include/BuildDFG/BuildDFG.h
@@ -10,13 +10,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/ValueMap.h"
-#include "llvm/IR/Module.h"
+#include "SupportVISC/DFGraph.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/Value.h"
-#include "SupportVISC/DFGraph.h"
+#include "llvm/IR/ValueMap.h"
 #include "llvm/Pass.h"
 
 using namespace llvm;
@@ -27,56 +27,54 @@ struct BuildDFG : public ModulePass {
   static char ID; // Pass identification, replacement for typeid
   BuildDFG() : ModulePass(ID) {}
 
-  typedef ValueMap<Value*, DFNode*> HandleToDFNode;
-  typedef ValueMap<Value*, DFEdge*> HandleToDFEdge;
+  typedef ValueMap<Value *, DFNode *> HandleToDFNode;
+  typedef ValueMap<Value *, DFEdge *> HandleToDFEdge;
 
 private:
   // Member variables
   DFInternalNode *Root;
-  std::vector<DFInternalNode*> Roots;
+  std::vector<DFInternalNode *> Roots;
 
-  HandleToDFNode HandleToDFNodeMap;   // This map associates the i8* pointer
+  HandleToDFNode HandleToDFNodeMap; // This map associates the i8* pointer
   // with the DFNode structure that it
   // represents
-  HandleToDFEdge HandleToDFEdgeMap;   // This map associates the i8* pointer
+  HandleToDFEdge HandleToDFEdgeMap; // This map associates the i8* pointer
   // with the DFEdge structure that it
   // represents
 
-
   // Functions
 public:
-  void handleCreateNode (DFInternalNode* N, IntrinsicInst* II);
+  void handleCreateNode(DFInternalNode *N, IntrinsicInst *II);
+
 private:
-  void handleCreateEdge (DFInternalNode* N, IntrinsicInst* II);
-  void handleGetParentNode (DFInternalNode* N, IntrinsicInst* II);
-  void handleBindInput (DFInternalNode* N, IntrinsicInst* II);
-  void handleBindOutput (DFInternalNode* N, IntrinsicInst* II);
+  void handleCreateEdge(DFInternalNode *N, IntrinsicInst *II);
+  void handleGetParentNode(DFInternalNode *N, IntrinsicInst *II);
+  void handleBindInput(DFInternalNode *N, IntrinsicInst *II);
+  void handleBindOutput(DFInternalNode *N, IntrinsicInst *II);
 
-  void BuildGraph (DFInternalNode* N, Function* F);
+  void BuildGraph(DFInternalNode *N, Function *F);
 
 public:
   // Functions
   virtual bool runOnModule(Module &M);
 
-  static bool isViscLaunchIntrinsic(Instruction * I);
-  static bool isViscGraphIntrinsic(Instruction * I);
-  static bool isViscQueryIntrinsic(Instruction* I);
-  static bool isViscIntrinsic(Instruction* I);
+  static bool isViscLaunchIntrinsic(Instruction *I);
+  static bool isViscGraphIntrinsic(Instruction *I);
+  static bool isViscQueryIntrinsic(Instruction *I);
+  static bool isViscIntrinsic(Instruction *I);
   static bool isTypeCongruent(Type *L, Type *R);
 
-  //TODO: Maybe make these fields const
+  // TODO: Maybe make these fields const
   DFInternalNode *getRoot() const;
-  std::vector<DFInternalNode*> &getRoots();
+  std::vector<DFInternalNode *> &getRoots();
   HandleToDFNode &getHandleToDFNodeMap();
   HandleToDFEdge &getHandleToDFEdgeMap();
-  void addElementToHandleToDFNodeMap(Value* V, DFNode* N);
-  void removeElementFromHandleToDFNodeMap(Value* V);
-  void addElementToHandleToDFEdgeMap(Value* V, DFEdge* E);
-  void removeElementFromHandleToDFEdgeMap(Value* V);
-
+  void addElementToHandleToDFNodeMap(Value *V, DFNode *N);
+  void removeElementFromHandleToDFNodeMap(Value *V);
+  void addElementToHandleToDFEdgeMap(Value *V, DFEdge *E);
+  void removeElementFromHandleToDFEdgeMap(Value *V);
 };
 
-} // End of namespace
+} // namespace builddfg
 
 #endif
-
diff --git a/hpvm/include/GenVISC/GenVISC.h b/hpvm/include/GenVISC/GenVISC.h
index 585af33953ebd28f108a3233b30c6769334230e7..1db9929be70fdc4335e23d7e879248f0ebb45c07 100644
--- a/hpvm/include/GenVISC/GenVISC.h
+++ b/hpvm/include/GenVISC/GenVISC.h
@@ -7,14 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/Module.h"
+#include "SupportVISC/VISCTimer.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "SupportVISC/VISCTimer.h"
 
 using namespace llvm;
 
@@ -24,27 +24,25 @@ struct GenVISC : public ModulePass {
   static char ID; // Pass identification, replacement for typeid
   GenVISC() : ModulePass(ID) {}
 
-
 private:
   // Member variables
-  Module* M;
+  Module *M;
   FunctionCallee llvm_visc_initializeTimerSet;
   FunctionCallee llvm_visc_switchToTimer;
   FunctionCallee llvm_visc_printTimerSet;
 
-  GlobalVariable* TimerSet;
+  GlobalVariable *TimerSet;
 
   // Functions
-  void initializeTimerSet(Instruction*);
-  void switchToTimer(enum visc_TimerID, Instruction*);
-  void printTimerSet(Instruction*);
-  Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = "");
+  void initializeTimerSet(Instruction *);
+  void switchToTimer(enum visc_TimerID, Instruction *);
+  void printTimerSet(Instruction *);
+  Value *getStringPointer(const Twine &S, Instruction *InsertBefore,
+                          const Twine &Name = "");
 
 public:
   // Functions
   virtual bool runOnModule(Module &M);
-
 };
 
-} // End of namespace
-
+} // namespace genvisc
diff --git a/hpvm/include/SupportVISC/DFG2LLVM.h b/hpvm/include/SupportVISC/DFG2LLVM.h
index 841756889fd7a791c78ace8a25c59dce3645e831..b9e4cc4158b71ab18fbeadf2e4d094055feb6149 100644
--- a/hpvm/include/SupportVISC/DFG2LLVM.h
+++ b/hpvm/include/SupportVISC/DFG2LLVM.h
@@ -10,31 +10,37 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/Module.h"
+#include "BuildDFG/BuildDFG.h"
+#include "SupportVISC/VISCHint.h"
+#include "SupportVISC/VISCTimer.h"
+#include "SupportVISC/VISCUtils.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
-#include "BuildDFG/BuildDFG.h"
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCTimer.h"
-#include "SupportVISC/VISCUtils.h"
 
 using namespace llvm;
 using namespace builddfg;
 
-#define TIMER(X) do { if (VISCTimer) { X; } } while (0)
-#define DECLARE(X) X = M.getOrInsertFunction(#X, \
-    runtimeModule->getFunction(#X)->getFunctionType()); \
-    //DEBUG(errs() << *X)
+#define TIMER(X)                                                               \
+  do {                                                                         \
+    if (VISCTimer) {                                                           \
+      X;                                                                       \
+    }                                                                          \
+  } while (0)
+#define DECLARE(X)                                                             \
+  X = M.getOrInsertFunction(                                                   \
+      #X, runtimeModule->getFunction(#X)->getFunctionType());                  \
+  // DEBUG(errs() << *X)
 
 namespace dfg2llvm {
 // Helper Functions
-static inline ConstantInt* getTimerID(Module&, enum visc_TimerID);
-static inline ConstantInt* getTimerID(Module&, enum visc::Target);
+static inline ConstantInt *getTimerID(Module &, enum visc_TimerID);
+static inline ConstantInt *getTargetID(Module &, enum visc::Target);
 
-bool hasAttribute(Function*, unsigned, Attribute::AttrKind);
+bool hasAttribute(Function *, unsigned, Attribute::AttrKind);
 
 // DFG2LLVM abstract class implementation
 class DFG2LLVM : public ModulePass {
@@ -54,24 +60,23 @@ public:
     AU.addRequired<BuildDFG>();
     AU.addPreserved<BuildDFG>();
   }
-
 };
 
 // Abstract Visitor for Code generation traversal (tree traversal for now)
 class CodeGenTraversal : public DFNodeVisitor {
 
 protected:
-  //Member variables
+  // Member variables
   Module &M;
   BuildDFG &DFG;
   bool VISCTimer = false;
   std::string TargetName = "None";
-  
+
   // Map from Old function associated with DFNode to new cloned function with
   // extra index and dimension arguments. This map also serves to find out if
   // we already have an index and dim extended function copy or not (i.e.,
   // "Have we visited this function before?")
-  DenseMap<DFNode*, Value*> OutputMap;
+  DenseMap<DFNode *, Value *> OutputMap;
 
   // VISC Runtime API
   std::unique_ptr<Module> runtimeModule;
@@ -79,103 +84,107 @@ protected:
   FunctionCallee llvm_visc_initializeTimerSet;
   FunctionCallee llvm_visc_switchToTimer;
   FunctionCallee llvm_visc_printTimerSet;
-  GlobalVariable* TimerSet;
-  GlobalVariable* GraphIDAddr;
-  Instruction* InitCall;
-  Instruction* CleanupCall;
-
+  GlobalVariable *TimerSet;
+  GlobalVariable *GraphIDAddr;
+  Instruction *InitCall;
+  Instruction *CleanupCall;
 
   // Functions
-  Value* getStringPointer(const Twine& S, Instruction* InsertBefore, const Twine& Name = "");
-//  void addArgument(Function*, Type*, const Twine& Name = "");
-  Function *addArgument(Function*, Type*, const Twine& Name = "");
-//  void addIdxDimArgs(Function* F);
-  Function *addIdxDimArgs(Function* F);
-  std::vector<Value*> extractElements(Value*, std::vector<Type*>,
-      std::vector<std::string>, Instruction*);
-  Argument* getArgumentAt(Function* F, unsigned offset);
+  Value *getStringPointer(const Twine &S, Instruction *InsertBefore,
+                          const Twine &Name = "");
+  //  void addArgument(Function*, Type*, const Twine& Name = "");
+  Function *addArgument(Function *, Type *, const Twine &Name = "");
+  //  void addIdxDimArgs(Function* F);
+  Function *addIdxDimArgs(Function *F);
+  std::vector<Value *> extractElements(Value *, std::vector<Type *>,
+                                       std::vector<std::string>, Instruction *);
+  Argument *getArgumentAt(Function *F, unsigned offset);
   void initTimerAPI();
 
   // Pure Virtual Functions
   virtual void init() = 0;
   virtual void initRuntimeAPI() = 0;
-  virtual void codeGen(DFInternalNode* N) = 0;
-  virtual void codeGen(DFLeafNode* N) = 0;
+  virtual void codeGen(DFInternalNode *N) = 0;
+  virtual void codeGen(DFLeafNode *N) = 0;
 
   // Virtual Functions
-  virtual void initializeTimerSet(Instruction*);
-  virtual void switchToTimer(enum visc_TimerID, Instruction*);
-  virtual void printTimerSet(Instruction*);
+  virtual void initializeTimerSet(Instruction *);
+  virtual void switchToTimer(enum visc_TimerID, Instruction *);
+  virtual void printTimerSet(Instruction *);
 
   virtual ~CodeGenTraversal() {}
 
-
 public:
-
   // Constructor
   CodeGenTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {}
 
-  static bool checkPreferredTarget(DFNode* N, visc::Target T);
-  static bool preferredTargetIncludes(DFNode* N, visc::Target T);
+  static bool checkPreferredTarget(DFNode *N, visc::Target T);
+  static bool preferredTargetIncludes(DFNode *N, visc::Target T);
   visc::Target getPreferredTarget(DFNode *N);
 
-  virtual void visit(DFInternalNode* N) {
+  virtual void visit(DFInternalNode *N) {
     // If code has already been generated for this internal node, skip the
     // children
-    if(N->getGenFunc() != NULL)
+    if (N->getGenFunc() != NULL)
       return;
 
-    DEBUG(errs() << "Start: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Start: Generating Code for Node (I) - "
+                 << N->getFuncPointer()->getName() << "\n");
 
     // Follows a bottom-up approach for code generation.
     // First generate code for all the child nodes
-    for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
-        e = N->getChildGraph()->end(); i != e; ++i) {
-      DFNode* child = *i;
+    for (DFGraph::children_iterator i = N->getChildGraph()->begin(),
+                                    e = N->getChildGraph()->end();
+         i != e; ++i) {
+      DFNode *child = *i;
       child->applyDFNodeVisitor(*this);
     }
     // Generate code for this internal node now. This way all the cloned
     // functions for children exist.
     codeGen(N);
-    DEBUG(errs() << "DONE: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "DONE: Generating Code for Node (I) - "
+                 << N->getFuncPointer()->getName() << "\n");
   }
 
-  virtual void visit(DFLeafNode* N) {
-    DEBUG(errs() << "Start: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n");
+  virtual void visit(DFLeafNode *N) {
+    DEBUG(errs() << "Start: Generating Code for Node (L) - "
+                 << N->getFuncPointer()->getName() << "\n");
     codeGen(N);
-    DEBUG(errs() << "DONE: Generating Code for Node (L) - " << N->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "DONE: Generating Code for Node (L) - "
+                 << N->getFuncPointer()->getName() << "\n");
   }
 };
 
 // -------------- CodeGenTraversal Implementation -----------------
 
-bool CodeGenTraversal::checkPreferredTarget(DFNode* N, visc::Target T) {
-  Function* F = N->getFuncPointer();
-  Module* M = F->getParent();
-  NamedMDNode* HintNode;
+bool CodeGenTraversal::checkPreferredTarget(DFNode *N, visc::Target T) {
+  Function *F = N->getFuncPointer();
+  Module *M = F->getParent();
+  NamedMDNode *HintNode;
   switch (T) {
-    case visc::GPU_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
-      break;
-    case visc::SPIR_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
-      break;
-    case visc::CUDNN_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn");
-      break;
-    case visc::PROMISE_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_promise");
-      break;
-    case visc::CPU_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
-      break;
-    default:
-      llvm_unreachable("Target Not supported yet!");
+  case visc::GPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+    break;
+  case visc::SPIR_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_spir");
+    break;
+  case visc::CUDNN_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_cudnn");
+    break;
+  case visc::PROMISE_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_promise");
+    break;
+  case visc::CPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+    break;
+  default:
+    llvm_unreachable("Target Not supported yet!");
   }
   for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
-    MDNode* MetaNode = HintNode->getOperand(i);
-    Value* FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue();
-    if(F == FHint)
+    MDNode *MetaNode = HintNode->getOperand(i);
+    // Dereferenced unconditionally, so use cast<> (asserts) over dyn_cast<>.
+    Value *FHint = cast<ValueAsMetadata>(MetaNode->getOperand(0))->getValue();
+    if (F == FHint)
       return true;
   }
   return false;
@@ -185,43 +194,44 @@ visc::Target CodeGenTraversal::getPreferredTarget(DFNode *N) {
   return viscUtils::getPreferredTarget(N->getFuncPointer());
 }
 
-bool CodeGenTraversal::preferredTargetIncludes(DFNode* N, visc::Target T) {
+bool CodeGenTraversal::preferredTargetIncludes(DFNode *N, visc::Target T) {
 
-  Function* F = N->getFuncPointer();
-  Module* M = F->getParent();
+  Function *F = N->getFuncPointer();
+  Module *M = F->getParent();
   std::vector<NamedMDNode *> HintNode;
   switch (T) {
-    case visc::GPU_TARGET:
-      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu"));
-      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
-      break;
-    case visc::SPIR_TARGET:
-      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir"));
-      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
-      break;
-    case visc::CPU_TARGET:
-      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu"));
-      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
-      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
-      break;
-    case visc::CUDNN_TARGET:
-      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cudnn"));
-      break;
-    case visc::PROMISE_TARGET:
-      HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_promise"));
-      break;
-    case visc::CPU_OR_GPU_TARGET:
-    case visc::CPU_OR_SPIR_TARGET:
-      assert(false && "Target should be one of CPU/GPU/SPIR\n");
-      break;
-    default:
-      llvm_unreachable("Target Not supported yet!");
+  case visc::GPU_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_gpu"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
+    break;
+  case visc::SPIR_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_spir"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
+    break;
+  case visc::CPU_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_gpu"));
+    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cpu_spir"));
+    break;
+  case visc::CUDNN_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_cudnn"));
+    break;
+  case visc::PROMISE_TARGET:
+    HintNode.push_back(M->getOrInsertNamedMetadata("visc_hint_promise"));
+    break;
+  case visc::CPU_OR_GPU_TARGET:
+  case visc::CPU_OR_SPIR_TARGET:
+    assert(false && "Target should be one of CPU/GPU/SPIR\n");
+    break;
+  default:
+    llvm_unreachable("Target Not supported yet!");
   }
 
   for (unsigned h = 0; h < HintNode.size(); h++) {
     for (unsigned i = 0; i < HintNode[h]->getNumOperands(); i++) {
       MDNode *MetaNode = HintNode[h]->getOperand(i);
-      Value *FHint = dyn_cast<ValueAsMetadata>(MetaNode->getOperand(0).get())->getValue();
+      // Dereferenced unconditionally, so use cast<> (asserts) over dyn_cast<>.
+      Value *FHint = cast<ValueAsMetadata>(MetaNode->getOperand(0))->getValue();
       if (F == FHint)
         return true;
     }
@@ -230,22 +240,25 @@ bool CodeGenTraversal::preferredTargetIncludes(DFNode* N, visc::Target T) {
   return false;
 }
 
-
 // Generate Code for declaring a constant string [L x i8] and return a pointer
 // to the start of it.
-Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) {
-  Constant* SConstant = ConstantDataArray::getString(M.getContext(), S.str(), true);
-  Value* SGlobal = new GlobalVariable(M, SConstant->getType(), true,
-                                      GlobalValue::InternalLinkage, SConstant, Name);
-  Value* Zero = ConstantInt::get(Type::getInt64Ty(M.getContext()), 0);
-  Value* GEPArgs[] = {Zero, Zero};
-  GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal,
-                            ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB);
+Value *CodeGenTraversal::getStringPointer(const Twine &S, Instruction *IB,
+                                          const Twine &Name) {
+  Constant *SConstant =
+      ConstantDataArray::getString(M.getContext(), S.str(), true);
+  Value *SGlobal =
+      new GlobalVariable(M, SConstant->getType(), true,
+                         GlobalValue::InternalLinkage, SConstant, Name);
+  Value *Zero = ConstantInt::get(Type::getInt64Ty(M.getContext()), 0);
+  Value *GEPArgs[] = {Zero, Zero};
+  GetElementPtrInst *SPtr = GetElementPtrInst::Create(
+      nullptr, SGlobal, ArrayRef<Value *>(GEPArgs, 2), Name + "Ptr", IB);
   return SPtr;
 }
 
 // Add an argument of type Ty to the given function F
-//void CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) {
+// void CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name)
+// {
 //  // Add the argument to argument list
 //  new Argument(Ty, name, F);
 //
@@ -258,14 +271,15 @@ Value* CodeGenTraversal::getStringPointer(const Twine& S, Instruction* IB, const
 //  // Adding new arguments to the function argument list, would not change the
 //  // function type. We need to change the type of this function to reflect the
 //  // added arguments
-//  FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
-//  PointerType* PTy = PointerType::get(FTy, cast<PointerType>(F->getType())->getAddressSpace());
+//  FunctionType* FTy = FunctionType::get(F->getReturnType(), ArgTypes,
+//  F->isVarArg()); PointerType* PTy = PointerType::get(FTy,
+//  cast<PointerType>(F->getType())->getAddressSpace());
 //
 //  // Change the function type
 //  F->mutateType(PTy);
 //}
 
-void renameNewArgument(Function *newF, const Twine& argName){
+void renameNewArgument(Function *newF, const Twine &argName) {
   // Get Last argument in Function Arg List and rename it to given name
   Argument *lastArg = &*(newF->arg_end() - 1);
   lastArg->setName(argName);
@@ -273,29 +287,31 @@ void renameNewArgument(Function *newF, const Twine& argName){
 
 // Creates a function with an additional argument of the specified type and
 // name. The previous function is not deleted.
-Function *CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name) {
+Function *CodeGenTraversal::addArgument(Function *F, Type *Ty,
+                                        const Twine &name) {
   Argument *new_arg = new Argument(Ty, name);
 
   // Create the argument type list with added argument types
-  std::vector<Type*> ArgTypes;
-  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
+  std::vector<Type *> ArgTypes;
+  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+       ai != ae; ++ai) {
     ArgTypes.push_back(ai->getType());
   }
   ArgTypes.push_back(new_arg->getType());
-  
+
   // Adding new arguments to the function argument list, would not change the
   // function type. We need to change the type of this function to reflect the
   // added arguments. So, we create a clone of this function with the correct
   // type.
-  FunctionType *FTy = FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
+  FunctionType *FTy =
+      FunctionType::get(F->getReturnType(), ArgTypes, F->isVarArg());
   Function *newF = Function::Create(FTy, F->getLinkage(),
-				    F->getName() + "_cloned", F->getParent());
+                                    F->getName() + "_cloned", F->getParent());
   renameNewArgument(newF, name);
   newF = viscUtils::cloneFunction(F, newF, false);
 
   // Check if the function is used by a metadata node
-  if(F->isUsedByMetadata()) {
+  if (F->isUsedByMetadata()) {
     viscUtils::fixHintMetadata(*F->getParent(), F, newF);
   }
 
@@ -303,17 +319,17 @@ Function *CodeGenTraversal::addArgument(Function* F, Type* Ty, const Twine& name
 }
 
 // Change the argument list of function F to add index and limit arguments
-//void CodeGenTraversal::addIdxDimArgs(Function* F) {
+// void CodeGenTraversal::addIdxDimArgs(Function* F) {
 //  // Add Index and Dim arguments
-//  std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
-//  for (int i = 0; i < 6; ++i) {
+//  std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y",
+//  "dim_z"}; for (int i = 0; i < 6; ++i) {
 //    addArgument(F, Type::getInt32Ty(F->getContext()), names[i]);
 //  }
 //}
 
 // Return new function with additional index and limit arguments.
 // The original function is removed from the module and erased.
-Function *CodeGenTraversal::addIdxDimArgs(Function* F) {
+Function *CodeGenTraversal::addIdxDimArgs(Function *F) {
   DEBUG(errs() << "Function Type: " << *F->getFunctionType() << "\n");
   // Add Index and Dim arguments
   std::string names[] = {"idx_x", "idx_y", "idx_z", "dim_x", "dim_y", "dim_z"};
@@ -324,43 +340,42 @@ Function *CodeGenTraversal::addIdxDimArgs(Function* F) {
     F->eraseFromParent();
     F = newF;
   }
-  DEBUG(errs() << "Function Type after adding args: " << *newF->getFunctionType() << "\n");
+  DEBUG(errs() << "Function Type after adding args: "
+               << *newF->getFunctionType() << "\n");
   return newF;
 }
 
 // Extract elements from an aggregate value. TyList contains the type of each
 // element, and names vector contains a name. IB is the instruction before which
 // all the generated code would be inserted.
-std::vector<Value*> CodeGenTraversal::extractElements(Value* Aggregate,
-    std::vector<Type*> TyList, std::vector<std::string> names, Instruction* IB) {
+std::vector<Value *>
+CodeGenTraversal::extractElements(Value *Aggregate, std::vector<Type *> TyList,
+                                  std::vector<std::string> names,
+                                  Instruction *IB) {
   // Extract input data from i8* Aggregate.addr and store them in a vector.
   // For each argument
-  std::vector<Value*> Elements;
-  GetElementPtrInst* GEP;
+  std::vector<Value *> Elements;
+  GetElementPtrInst *GEP;
   unsigned argNum = 0;
-  for(Type* Ty: TyList) {
+  for (Type *Ty : TyList) {
     // BitCast: %arg.addr = bitcast i8* Aggregate.addr to <pointer-to-argType>
-    CastInst* BI = BitCastInst::CreatePointerCast(Aggregate,
-                   Ty->getPointerTo(),
-                   names[argNum]+".addr",
-                   IB);
+    CastInst *BI = BitCastInst::CreatePointerCast(Aggregate, Ty->getPointerTo(),
+                                                  names[argNum] + ".addr", IB);
     // Load: %arg = load <pointer-to-argType> %arg.addr
-    LoadInst* LI = new LoadInst(BI, names[argNum], IB);
+    LoadInst *LI = new LoadInst(BI, names[argNum], IB);
     // Patch argument to call instruction
     Elements.push_back(LI);
-    //errs() << "Pushing element " << *LI << "\n";
-    //CI->setArgOperand(argNum, LI);
+    // errs() << "Pushing element " << *LI << "\n";
+    // CI->setArgOperand(argNum, LI);
 
     // TODO: Minor Optimization - The last GEP statement can/should be left out
     // as no more arguments left
-    // Increment using GEP: %nextArg = getelementptr <ptr-to-argType> %arg.addr, i64 1
-    // This essentially takes us to the next argument in memory
-    Constant* IntOne = ConstantInt::get(Type::getInt64Ty(M.getContext()), 1);
-    if (argNum < TyList.size()-1)
-      GEP = GetElementPtrInst::Create(nullptr, BI,
-                                                        ArrayRef<Value*>(IntOne),
-                                                        "nextArg",
-                                                        IB);
+    // Increment using GEP: %nextArg = getelementptr <ptr-to-argType> %arg.addr,
+    // i64 1. This essentially takes us to the next argument in memory.
+    Constant *IntOne = ConstantInt::get(Type::getInt64Ty(M.getContext()), 1);
+    if (argNum < TyList.size() - 1)
+      GEP = GetElementPtrInst::Create(nullptr, BI, ArrayRef<Value *>(IntOne),
+                                      "nextArg", IB);
     // Increment argNum and for the next iteration use result of this GEP to
     // extract next argument
     argNum++;
@@ -370,11 +385,11 @@ std::vector<Value*> CodeGenTraversal::extractElements(Value* Aggregate,
 }
 
 // Traverse the function F argument list to get argument at offset
-Argument* CodeGenTraversal::getArgumentAt(Function* F, unsigned offset) {
+Argument *CodeGenTraversal::getArgumentAt(Function *F, unsigned offset) {
   DEBUG(errs() << "Finding argument " << offset << ":\n");
-  assert((F->getFunctionType()->getNumParams() > offset)
-         && "Invalid offset to access arguments!");
-  
+  assert((F->getFunctionType()->getNumParams() > offset) &&
+         "Invalid offset to access arguments!");
+
   Function::arg_iterator ArgIt = F->arg_begin() + offset;
   Argument *Arg = &*ArgIt;
   return Arg;
@@ -388,57 +403,51 @@ void CodeGenTraversal::initTimerAPI() {
 
 // Timer Routines
 // Initialize the timer set
-void CodeGenTraversal::initializeTimerSet(Instruction* InsertBefore) {
- //DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n");
-  TIMER(TimerSet = new GlobalVariable(M,
-                                      Type::getInt8PtrTy(M.getContext()),
-                                      false,
-                                      GlobalValue::CommonLinkage,
-                                      Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-                                      Twine("viscTimerSet_")+TargetName);
-    DEBUG(errs() << "New global variable: " << *TimerSet << "\n");
-
-    Value* TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
-                                          None,
-                                          "",
-                                          InsertBefore);
-    new StoreInst(TimerSetAddr, TimerSet, InsertBefore);
-  );
+void CodeGenTraversal::initializeTimerSet(Instruction *InsertBefore) {
+  // DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet <<
+  // "\n");
+  TIMER(TimerSet = new GlobalVariable(
+            M, Type::getInt8PtrTy(M.getContext()), false,
+            GlobalValue::CommonLinkage,
+            Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
+            Twine("viscTimerSet_") + TargetName);
+        DEBUG(errs() << "New global variable: " << *TimerSet << "\n");
+
+        Value *TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
+                                               None, "", InsertBefore);
+        new StoreInst(TimerSetAddr, TimerSet, InsertBefore););
 }
 
-void CodeGenTraversal::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) {
-  Value* switchArgs[] = {TimerSet, getTimerID(M, timer)};
+void CodeGenTraversal::switchToTimer(enum visc_TimerID timer,
+                                     Instruction *InsertBefore) {
+  Value *switchArgs[] = {TimerSet, getTimerID(M, timer)};
   TIMER(CallInst::Create(llvm_visc_switchToTimer,
-                         ArrayRef<Value*>(switchArgs, 2),
-                         "",
-                         InsertBefore));
+                         ArrayRef<Value *>(switchArgs, 2), "", InsertBefore));
 }
 
-void CodeGenTraversal::printTimerSet(Instruction* InsertBefore) {
-  Value* TimerName;
-  TIMER(TimerName = getStringPointer(TargetName+Twine("_Timer"), InsertBefore));
-  Value* printArgs[] = {TimerSet, TimerName};
+void CodeGenTraversal::printTimerSet(Instruction *InsertBefore) {
+  Value *TimerName;
+  TIMER(TimerName =
+            getStringPointer(TargetName + Twine("_Timer"), InsertBefore));
+  Value *printArgs[] = {TimerSet, TimerName};
   TIMER(CallInst::Create(llvm_visc_printTimerSet,
-                         ArrayRef<Value*>(printArgs, 2),
-                         "",
-                         InsertBefore));
+                         ArrayRef<Value *>(printArgs, 2), "", InsertBefore));
 }
 
 // Implementation of Helper Functions
-static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) {
+static inline ConstantInt *getTimerID(Module &M, enum visc_TimerID timer) {
   return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer);
 }
 
-static inline ConstantInt* getTargetID(Module& M, enum visc::Target T) {
+static inline ConstantInt *getTargetID(Module &M, enum visc::Target T) {
   return ConstantInt::get(Type::getInt32Ty(M.getContext()), T);
 }
 
 // Find if argument has the given attribute
-bool hasAttribute(Function* F, unsigned arg_index, Attribute::AttrKind AK) {
-  return F->getAttributes().hasAttribute(arg_index+1, AK);
+bool hasAttribute(Function *F, unsigned arg_index, Attribute::AttrKind AK) {
+  return F->getAttributes().hasAttribute(arg_index + 1, AK);
 }
 
-} // End of namespace
+} // namespace dfg2llvm
 
 #endif
-
diff --git a/hpvm/include/SupportVISC/DFGTreeTraversal.h b/hpvm/include/SupportVISC/DFGTreeTraversal.h
index 095ba1fb88978c3ec71fc1d1ac03e07d05b5f88c..67c317a2e9857b9000e4d77f6858494eb81c1ec1 100644
--- a/hpvm/include/SupportVISC/DFGTreeTraversal.h
+++ b/hpvm/include/SupportVISC/DFGTreeTraversal.h
@@ -1,6 +1,6 @@
 #ifndef __DFGTREETRAVERSAL_H__
 #define __DFGTREETRAVERSAL_H__
-	
+
 //=== DFGTreeTraversal.h - Header file for Tree Traversal of the HPVM DFG ====//
 //
 //                     The LLVM Compiler Infrastructure
@@ -9,56 +9,61 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-	
-#include "llvm/IR/Module.h"
+
+#include "llvm/BuildDFG/BuildDFG.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/BuildDFG/BuildDFG.h"
-	
+
 using namespace llvm;
 using namespace builddfg;
-	
+
 namespace dfg2llvm {
-	
-  class DFGTreeTraversal : public DFNodeVisitor {
-	
-  protected:
-    //Member variables
-    Module &M;
-    BuildDFG &DFG;
-	
-    virtual void process(DFInternalNode* N) = 0;
-    virtual void process(DFLeafNode* N) = 0;
-	
-    virtual ~DFGTreeTraversal() {}
-	
-  public:
-    // Constructor
+
+class DFGTreeTraversal : public DFNodeVisitor {
+
+protected:
+  // Member variables
+  Module &M;
+  BuildDFG &DFG;
+
+  virtual void process(DFInternalNode *N) = 0;
+  virtual void process(DFLeafNode *N) = 0;
+
+  virtual ~DFGTreeTraversal() {}
+
+public:
+  // Constructor
   DFGTreeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {}
-	
-    void visit(DFInternalNode* N) {
-      // May visit a nodemore than once, there is no marking it as visited
-      DEBUG(errs() << "Start: In Node (I) - " << N->getFuncPointer()->getName() << "\n");
-	
-      // Follows a bottom-up approach.
-      for (DFGraph::children_iterator i = N->getChildGraph()->begin(),
-	     e = N->getChildGraph()->end(); i != e; ++i) {
-	DFNode* child = *i;
-	child->applyDFNodeVisitor(*this);
-      }
-	
-      // Process this internal node now.
-      process(N);
-      DEBUG(errs() << "DONE: In Node (I) - " << N->getFuncPointer()->getName() << "\n");
-    }
-	
-    void visit(DFLeafNode* N) {
-      DEBUG(errs() << "Start: In Node (L) - " << N->getFuncPointer()->getName() << "\n");
-      process(N);
-      DEBUG(errs() << "DONE: In Node (L) - " << N->getFuncPointer()->getName() << "\n");
+
+  void visit(DFInternalNode *N) {
+    // May visit a node more than once; there is no marking it as visited
+    DEBUG(errs() << "Start: In Node (I) - " << N->getFuncPointer()->getName()
+                 << "\n");
+
+    // Follows a bottom-up approach.
+    for (DFGraph::children_iterator i = N->getChildGraph()->begin(),
+                                    e = N->getChildGraph()->end();
+         i != e; ++i) {
+      DFNode *child = *i;
+      child->applyDFNodeVisitor(*this);
     }
-  };
-	
+
+    // Process this internal node now.
+    process(N);
+    DEBUG(errs() << "DONE: In Node (I) - " << N->getFuncPointer()->getName()
+                 << "\n");
+  }
+
+  void visit(DFLeafNode *N) {
+    DEBUG(errs() << "Start: In Node (L) - " << N->getFuncPointer()->getName()
+                 << "\n");
+    process(N);
+    DEBUG(errs() << "DONE: In Node (L) - " << N->getFuncPointer()->getName()
+                 << "\n");
+  }
+};
+
 } // end namespace dfg2llvm
-	
+
 #endif
diff --git a/hpvm/include/SupportVISC/DFGraph.h b/hpvm/include/SupportVISC/DFGraph.h
index 1207f1efc65ef69570425036ff4de7cf5f9cbf0c..0c224a344c4ec342f52f4816280e101518ba43dd 100644
--- a/hpvm/include/SupportVISC/DFGraph.h
+++ b/hpvm/include/SupportVISC/DFGraph.h
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains the definition of the following classes: 
+// This file contains the definition of the following classes:
 // 1. DFNode
 // 2. DFGraph
 // 3. DFInternalNode
@@ -20,17 +20,16 @@
 #ifndef LLVM_IR_DFGRAPH_H
 #define LLVM_IR_DFGRAPH_H
 
+#include "SupportVISC/VISCHint.h"
+#include "SupportVISC/VISCUtils.h"
+#include "llvm/ADT/GraphTraits.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Value.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/GraphWriter.h"
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCUtils.h"
-
+#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
 
@@ -62,57 +61,44 @@ struct TargetGenFuncInfo {
 class DFGraph {
 
 private:
-  typedef std::vector<DFNode*> DFNodeListType;
-  typedef std::vector<DFEdge*> DFEdgeListType;
+  typedef std::vector<DFNode *> DFNodeListType;
+  typedef std::vector<DFEdge *> DFEdgeListType;
 
   // Important things that make up a Dataflow graph
-  DFNode* Entry;                  ///< Dummy node to act as source for edges
-                                  ///< from parent to nodes in the graph
-  DFNode* Exit;                   ///< Dummy node to act as destination for edges
-                                  ///< from nodes in the graph to parent
-  DFInternalNode* Parent;
-  DFNodeListType ChildrenList;    ///< List of children Dataflow Nodes
-  DFEdgeListType DFEdgeList;      ///< List of Dataflow edges among children
-
+  DFNode *Entry; ///< Dummy node to act as source for edges
+                 ///< from parent to nodes in the graph
+  DFNode *Exit;  ///< Dummy node to act as destination for edges
+                 ///< from nodes in the graph to parent
+  DFInternalNode *Parent;
+  DFNodeListType ChildrenList; ///< List of children Dataflow Nodes
+  DFEdgeListType DFEdgeList;   ///< List of Dataflow edges among children
 
 public:
-  DFGraph(DFInternalNode* P);
+  DFGraph(DFInternalNode *P);
 
   virtual ~DFGraph() {}
 
-  void addChildDFNode(DFNode* child) {
-    ChildrenList.push_back(child);
-  }
+  void addChildDFNode(DFNode *child) { ChildrenList.push_back(child); }
 
-  void removeChildDFNode(DFNode* child) {
+  void removeChildDFNode(DFNode *child) {
     children_iterator position = std::find(begin(), end(), child);
     if (position != end()) // the child was found
       ChildrenList.erase(position);
   }
 
   // Dataflow edge connecting child dataflow nodes
-  void addDFEdge(DFEdge* E) {
-    DFEdgeList.push_back(E);
-  }
+  void addDFEdge(DFEdge *E) { DFEdgeList.push_back(E); }
 
-  DFNode* getEntry() const {
-    return Entry;
-  }
+  DFNode *getEntry() const { return Entry; }
 
-  DFNode* getExit() const {
-    return Exit;
-  }
+  DFNode *getExit() const { return Exit; }
 
-  bool isEntry(const DFNode* N) const {
-    return N == Entry;
-  }
+  bool isEntry(const DFNode *N) const { return N == Entry; }
 
-  bool isExit(const DFNode* N) const {
-    return N == Exit;
-  }
+  bool isExit(const DFNode *N) const { return N == Exit; }
 
   void sortChildren();
-  static bool compareRank(DFNode* A, DFNode* B);
+  static bool compareRank(DFNode *A, DFNode *B);
 
   // Iterators
   typedef DFNodeListType::iterator children_iterator;
@@ -124,56 +110,52 @@ public:
   //===--------------------------------------------------------------------===//
   // DFNodeList iterator forwarding functions
   //
-  children_iterator       begin()       { return ChildrenList.begin(); }
+  children_iterator begin() { return ChildrenList.begin(); }
   const_children_iterator begin() const { return ChildrenList.begin(); }
-  children_iterator       end  ()       { return ChildrenList.end();   }
-  const_children_iterator end  () const { return ChildrenList.end();   }
+  children_iterator end() { return ChildrenList.end(); }
+  const_children_iterator end() const { return ChildrenList.end(); }
 
-  size_t                   size() const { return ChildrenList.size();  }
-  bool                    empty() const { return ChildrenList.empty(); }
-  const DFNode           *front() const { return ChildrenList.front(); }
-        DFNode           *front()       { return ChildrenList.front(); }
-  const DFNode            *back() const { return ChildrenList.back();  }
-        DFNode            *back()       { return ChildrenList.back();  }
+  size_t size() const { return ChildrenList.size(); }
+  bool empty() const { return ChildrenList.empty(); }
+  const DFNode *front() const { return ChildrenList.front(); }
+  DFNode *front() { return ChildrenList.front(); }
+  const DFNode *back() const { return ChildrenList.back(); }
+  DFNode *back() { return ChildrenList.back(); }
 
   //===--------------------------------------------------------------------===//
 
   //===--------------------------------------------------------------------===//
   // DFEdgeList iterator forwarding functions
   //
-  dfedge_iterator       dfedge_begin()       { return DFEdgeList.begin(); }
+  dfedge_iterator dfedge_begin() { return DFEdgeList.begin(); }
   const_dfedge_iterator dfedge_begin() const { return DFEdgeList.begin(); }
-  dfedge_iterator       dfedge_end  ()       { return DFEdgeList.end();   }
-  const_dfedge_iterator dfedge_end  () const { return DFEdgeList.end();   }
+  dfedge_iterator dfedge_end() { return DFEdgeList.end(); }
+  const_dfedge_iterator dfedge_end() const { return DFEdgeList.end(); }
 
-  size_t                 dfedge_size() const { return DFEdgeList.size();  }
-  bool                  dfedge_empty() const { return DFEdgeList.empty(); }
-  const DFEdge         *dfedge_front() const { return DFEdgeList.front(); }
-        DFEdge         *dfedge_front()       { return DFEdgeList.front(); }
-  const DFEdge          *dfedge_back() const { return DFEdgeList.back();  }
-        DFEdge          *dfedge_back()       { return DFEdgeList.back();  }
+  size_t dfedge_size() const { return DFEdgeList.size(); }
+  bool dfedge_empty() const { return DFEdgeList.empty(); }
+  const DFEdge *dfedge_front() const { return DFEdgeList.front(); }
+  DFEdge *dfedge_front() { return DFEdgeList.front(); }
+  const DFEdge *dfedge_back() const { return DFEdgeList.back(); }
+  DFEdge *dfedge_back() { return DFEdgeList.back(); }
 
   //===--------------------------------------------------------------------===//
 
-  DFInternalNode* getParent() const {
-    return Parent;
-  }
+  DFInternalNode *getParent() const { return Parent; }
 
   // Child graph is streaming if any of the edges in the edge list is streaming
   bool isStreaming();
 
-
   //**************************************************************************//
   //*                  Functions to modify a dataflow graph                  *//
   //**************************************************************************//
 
   // Delete an edge of the child graph
-  void deleteEdge(DFEdge* E) {
+  void deleteEdge(DFEdge *E) {
     dfedge_iterator position = std::find(dfedge_begin(), dfedge_end(), E);
     if (position != dfedge_end()) // the edge was found
       DFEdgeList.erase(position);
   }
-
 };
 
 // DFNode represents a single VISC Dataflow Node in LLVM.
@@ -190,35 +172,29 @@ class DFNode {
 
 public:
   // Discriminator for LLVM-style RTTI (dyn_cast et al.)
-  enum DFNodeKind {
-    InternalNode,
-    LeafNode
-  };
+  enum DFNodeKind { InternalNode, LeafNode };
 
-  enum PropertyKind {
-    Allocation,
-    NumProperties
-  };
+  enum PropertyKind { Allocation, NumProperties };
 
 private:
-  typedef std::vector<DFNode*> DFNodeListType;
-  typedef std::vector<DFEdge*> DFEdgeListType;
-  typedef void* PropertyType;
+  typedef std::vector<DFNode *> DFNodeListType;
+  typedef std::vector<DFEdge *> DFEdgeListType;
+  typedef void *PropertyType;
   typedef std::map<PropertyKind, PropertyType> PropertyListType;
 
   // Important things that make up a Dataflow Node
-  IntrinsicInst* II;              ///< Associated IntrinsicInst/Value
-  Function* FuncPointer;          ///< Associated Function
-  Function* GenFunc = NULL;       ///< Associated Function generated by backend
+  IntrinsicInst *II;        ///< Associated IntrinsicInst/Value
+  Function *FuncPointer;    ///< Associated Function
+  Function *GenFunc = NULL; ///< Associated Function generated by backend
   struct TargetGenFunctions GenFuncs;
-                                  ///< Associated Functions generated by backends
-                                  ///< (if multiple are available)
+  ///< Associated Functions generated by backends
+  ///< (if multiple are available)
   struct TargetGenFuncInfo GenFuncInfo;
-                                  ///< True for each target generated function
-                                  ///< if the associated genFunc is an x86 function
-  DFInternalNode* Parent;         ///< Pointer to parent dataflow Node
+  ///< True for each target generated function
+  ///< if the associated genFunc is an x86 function
+  DFInternalNode *Parent;         ///< Pointer to parent dataflow Node
   unsigned NumOfDim;              ///< Number of dimensions
-  std::vector<Value*> DimLimits;  ///< Number of instances in each dimension
+  std::vector<Value *> DimLimits; ///< Number of instances in each dimension
   DFNodeListType Successors;      ///< List of successors i.e.,
                                   ///< destination DFNodes to DFEdges
                                   ///< originating from this DFNode
@@ -229,7 +205,7 @@ private:
                                   ///< DFEdges originating from this DFNode to
                                   ///< successor DFNodes
   PropertyListType PropertyList;  ///< List of Properties
-  StructType* OutputType;         ///< Output Type
+  StructType *OutputType;         ///< Output Type
   unsigned Level;                 ///< Distance to the top-level DFNode in the
                                   ///< hierarchy
   unsigned Rank;                  ///< Ordering based on toplogical sort
@@ -255,268 +231,233 @@ public:
   //===--------------------------------------------------------------------===//
   // Successors iterator forwarding functions
   //
-  successor_iterator       successors_begin()        { return Successors.begin(); }
-  const_successor_iterator successors_begin()  const { return Successors.begin(); }
-  successor_iterator       successors_end  ()        { return Successors.end();   }
-  const_successor_iterator successors_end  ()  const { return Successors.end();   }
-
-  size_t                   successors_size()   const { return Successors.size();  }
-  bool                     successors_empty()  const { return Successors.empty(); }
-  const DFNode*            successors_front()  const { return Successors.front(); }
-        DFNode*            successors_front()        { return Successors.front(); }
-  const DFNode*            successors_back()   const { return Successors.back();  }
-        DFNode*            successors_back()         { return Successors.back();  }
+  successor_iterator successors_begin() { return Successors.begin(); }
+  const_successor_iterator successors_begin() const {
+    return Successors.begin();
+  }
+  successor_iterator successors_end() { return Successors.end(); }
+  const_successor_iterator successors_end() const { return Successors.end(); }
+
+  size_t successors_size() const { return Successors.size(); }
+  bool successors_empty() const { return Successors.empty(); }
+  const DFNode *successors_front() const { return Successors.front(); }
+  DFNode *successors_front() { return Successors.front(); }
+  const DFNode *successors_back() const { return Successors.back(); }
+  DFNode *successors_back() { return Successors.back(); }
 
   //===--------------------------------------------------------------------===//
 
   //===--------------------------------------------------------------------===//
   // InDFEdges iterator forwarding functions
   //
-  indfedge_iterator       indfedge_begin()       { return InDFEdges.begin(); }
+  indfedge_iterator indfedge_begin() { return InDFEdges.begin(); }
   const_indfedge_iterator indfedge_begin() const { return InDFEdges.begin(); }
-  indfedge_iterator       indfedge_end  ()       { return InDFEdges.end();   }
-  const_indfedge_iterator indfedge_end  () const { return InDFEdges.end();   }
+  indfedge_iterator indfedge_end() { return InDFEdges.end(); }
+  const_indfedge_iterator indfedge_end() const { return InDFEdges.end(); }
 
-  size_t                   indfedge_size() const { return InDFEdges.size();  }
-  bool                    indfedge_empty() const { return InDFEdges.empty(); }
-  const DFEdge           *indfedge_front() const { return InDFEdges.front(); }
-        DFEdge           *indfedge_front()       { return InDFEdges.front(); }
-  const DFEdge            *indfedge_back() const { return InDFEdges.back();  }
-        DFEdge            *indfedge_back()       { return InDFEdges.back();  }
+  size_t indfedge_size() const { return InDFEdges.size(); }
+  bool indfedge_empty() const { return InDFEdges.empty(); }
+  const DFEdge *indfedge_front() const { return InDFEdges.front(); }
+  DFEdge *indfedge_front() { return InDFEdges.front(); }
+  const DFEdge *indfedge_back() const { return InDFEdges.back(); }
+  DFEdge *indfedge_back() { return InDFEdges.back(); }
 
   //===--------------------------------------------------------------------===//
 
   //===--------------------------------------------------------------------===//
   // OutDFEdges iterator forwarding functions
   //
-  outdfedge_iterator       outdfedge_begin()       { return OutDFEdges.begin(); }
-  const_outdfedge_iterator outdfedge_begin() const { return OutDFEdges.begin(); }
-  outdfedge_iterator       outdfedge_end  ()       { return OutDFEdges.end();   }
-  const_outdfedge_iterator outdfedge_end  () const { return OutDFEdges.end();   }
-
-  size_t                    outdfedge_size() const { return OutDFEdges.size();  }
-  bool                     outdfedge_empty() const { return OutDFEdges.empty(); }
-  const DFEdge            *outdfedge_front() const { return OutDFEdges.front(); }
-        DFEdge            *outdfedge_front()       { return OutDFEdges.front(); }
-  const DFEdge             *outdfedge_back() const { return OutDFEdges.back();  }
-        DFEdge             *outdfedge_back()       { return OutDFEdges.back();  }
+  outdfedge_iterator outdfedge_begin() { return OutDFEdges.begin(); }
+  const_outdfedge_iterator outdfedge_begin() const {
+    return OutDFEdges.begin();
+  }
+  outdfedge_iterator outdfedge_end() { return OutDFEdges.end(); }
+  const_outdfedge_iterator outdfedge_end() const { return OutDFEdges.end(); }
+
+  size_t outdfedge_size() const { return OutDFEdges.size(); }
+  bool outdfedge_empty() const { return OutDFEdges.empty(); }
+  const DFEdge *outdfedge_front() const { return OutDFEdges.front(); }
+  DFEdge *outdfedge_front() { return OutDFEdges.front(); }
+  const DFEdge *outdfedge_back() const { return OutDFEdges.back(); }
+  DFEdge *outdfedge_back() { return OutDFEdges.back(); }
 
   //===--------------------------------------------------------------------===//
 
   // Functions
 
-  DFNodeKind getKind() const {
-    return Kind;
-  }
-  
-  DFNode(IntrinsicInst* _II, Function* _FuncPointer, visc::Target _Hint,
-      DFInternalNode* _Parent, unsigned _NumOfDim, std::vector<Value*>
-      _DimLimits, DFNodeKind _K);
+  DFNodeKind getKind() const { return Kind; }
+
+  DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint,
+         DFInternalNode *_Parent, unsigned _NumOfDim,
+         std::vector<Value *> _DimLimits, DFNodeKind _K);
 
   bool isRoot() const {
     // It is a root node is it was created from a launch intrinsic
-    if(II->getCalledFunction()->getName().equals("llvm.visc.launch")) {
-    	assert(Level == 0 && "Root node's level is zero.");
-        return true;
+    if (II->getCalledFunction()->getName().equals("llvm.visc.launch")) {
+      assert(Level == 0 && "Root node's level is zero.");
+      return true;
     }
     return false;
   }
 
-  StructType* getOutputType() const {
-    return OutputType;
-  }
+  StructType *getOutputType() const { return OutputType; }
 
-  void addSuccessor(DFNode* N) {
-    Successors.push_back(N);
-  }
+  void addSuccessor(DFNode *N) { Successors.push_back(N); }
 
   // Add incoming dataflow edge
-  void addInDFEdge(DFEdge* E) {
-    InDFEdges.push_back(E);
-  }
+  void addInDFEdge(DFEdge *E) { InDFEdges.push_back(E); }
 
   // Add outgoing dataflow edge
-  void addOutDFEdge(DFEdge* E) {
-    OutDFEdges.push_back(E);
-  }
+  void addOutDFEdge(DFEdge *E) { OutDFEdges.push_back(E); }
 
-  Function* getFuncPointer() const {
-    return FuncPointer;
-  }
+  Function *getFuncPointer() const { return FuncPointer; }
 
-  void setFuncPointer(Function* _FuncPointer) {
-    FuncPointer = _FuncPointer;
-  }
+  void setFuncPointer(Function *_FuncPointer) { FuncPointer = _FuncPointer; }
 
-  IntrinsicInst* getInstruction() const {
-    return II;
-  }
+  IntrinsicInst *getInstruction() const { return II; }
 
-  DFInternalNode* getParent() const {
-    return Parent;
-  }
+  DFInternalNode *getParent() const { return Parent; }
 
-  unsigned getNumOfDim() const {
-    return NumOfDim;
-  }
+  unsigned getNumOfDim() const { return NumOfDim; }
 
-  std::vector<Value*> getDimLimits() const {
-    return DimLimits;
-  }
+  std::vector<Value *> getDimLimits() const { return DimLimits; }
 
-  unsigned getLevel() const {
-    return Level;
-  }
+  unsigned getLevel() const { return Level; }
 
-  unsigned getRank() const {
-    return Rank;
-  }
+  unsigned getRank() const { return Rank; }
 
-  void setTag(visc::Target T) {
-    Tag = T; 
-  }
+  void setTag(visc::Target T) { Tag = T; }
 
-  visc::Target getTag() const {
-    return Tag; 
-  }
+  visc::Target getTag() const { return Tag; }
 
-  void* getProperty(PropertyKind PType) {
-    assert(PropertyList.count(PType) == 1
-        && "Requesting a property not defined!");
+  void *getProperty(PropertyKind PType) {
+    assert(PropertyList.count(PType) == 1 &&
+           "Requesting a property not defined!");
     return PropertyList[PType];
   }
 
-  void setProperty(PropertyKind PType, void* PValue) {
-    assert(PropertyList.count(PType) == 0
-        && "Inserting a property already defined!");
+  void setProperty(PropertyKind PType, void *PValue) {
+    assert(PropertyList.count(PType) == 0 &&
+           "Inserting a property already defined!");
     PropertyList[PType] = PValue;
   }
 
-  void setGenFunc(Function* F, visc::Target T) {
+  void setGenFunc(Function *F, visc::Target T) {
     GenFunc = F;
     Tag = T;
   }
 
-  Function* getGenFunc() const {
-    return GenFunc;
-  }
+  Function *getGenFunc() const { return GenFunc; }
 
   void setHasX86FuncForTarget(visc::Target T, bool isX86Func) {
     switch (T) {
-      case visc::None:
-	return; // Do nothing.
-      case visc::CPU_TARGET:
-        GenFuncInfo.cpu_hasX86Func = isX86Func;
-        break;
-      case visc::GPU_TARGET:
-        GenFuncInfo.gpu_hasX86Func = isX86Func;
-        break;
-      case visc::CPU_OR_GPU_TARGET:
-      	break;
-      default:
-       assert(false && "Unknown target\n");
-        break;
+    case visc::None:
+      return; // Do nothing.
+    case visc::CPU_TARGET:
+      GenFuncInfo.cpu_hasX86Func = isX86Func;
+      break;
+    case visc::GPU_TARGET:
+      GenFuncInfo.gpu_hasX86Func = isX86Func;
+      break;
+    case visc::CPU_OR_GPU_TARGET:
+      break;
+    default:
+      assert(false && "Unknown target\n");
+      break;
     }
-    return; 
+    return;
   }
 
   bool hasX86GenFuncForTarget(visc::Target T) const {
     switch (T) {
-      case visc::None:
-	return false;
-      case visc::CPU_TARGET:
-        return GenFuncInfo.cpu_hasX86Func;
-      case visc::GPU_TARGET:
-        return GenFuncInfo.gpu_hasX86Func;
-      case visc::CPU_OR_GPU_TARGET:
-        assert(false && "Single target expected (CPU/GPU/SPIR/CUDNN/PROMISE)\n");
-      default:
-       assert(false && "Unknown target\n");
+    case visc::None:
+      return false;
+    case visc::CPU_TARGET:
+      return GenFuncInfo.cpu_hasX86Func;
+    case visc::GPU_TARGET:
+      return GenFuncInfo.gpu_hasX86Func;
+    case visc::CPU_OR_GPU_TARGET:
+      assert(false && "Single target expected (CPU/GPU/SPIR/CUDNN/PROMISE)\n");
+    default:
+      assert(false && "Unknown target\n");
     }
-   return false;
+    return false;
   }
 
-  void addGenFunc(Function* F, visc::Target T, bool isX86Func) {
+  void addGenFunc(Function *F, visc::Target T, bool isX86Func) {
 
     switch (T) {
-      case visc::CPU_TARGET:
-        if (GenFuncs.CPUGenFunc != NULL) {
-          DEBUG(errs() << "Warning: Second generated CPU function for node "
-                       << FuncPointer->getName() << "\n");
-        }
-        GenFuncs.CPUGenFunc = F;
-        GenFuncInfo.cpu_hasX86Func = isX86Func;
-        break;
-      case visc::GPU_TARGET:
-        if (GenFuncs.GPUGenFunc != NULL) {
-          DEBUG(errs() << "Warning: Second generated GPU function for node "
-                       << FuncPointer->getName() << "\n");
-        }
-        GenFuncs.GPUGenFunc = F;
-        GenFuncInfo.gpu_hasX86Func = isX86Func;
-        break;
-      case visc::CPU_OR_GPU_TARGET:
-        assert(false &&
-               "A node function should be set with a tag specifying its \
+    case visc::CPU_TARGET:
+      if (GenFuncs.CPUGenFunc != NULL) {
+        DEBUG(errs() << "Warning: Second generated CPU function for node "
+                     << FuncPointer->getName() << "\n");
+      }
+      GenFuncs.CPUGenFunc = F;
+      GenFuncInfo.cpu_hasX86Func = isX86Func;
+      break;
+    case visc::GPU_TARGET:
+      if (GenFuncs.GPUGenFunc != NULL) {
+        DEBUG(errs() << "Warning: Second generated GPU function for node "
+                     << FuncPointer->getName() << "\n");
+      }
+      GenFuncs.GPUGenFunc = F;
+      GenFuncInfo.gpu_hasX86Func = isX86Func;
+      break;
+    case visc::CPU_OR_GPU_TARGET:
+      assert(false && "A node function should be set with a tag specifying its \
                 type, not the node hint itself\n");
-      default:
-        assert(false && "Unknown target for generated function\n");
+    default:
+      assert(false && "Unknown target for generated function\n");
     }
 
-    Tag = viscUtils::getUpdatedTag(Tag,T);
+    Tag = viscUtils::getUpdatedTag(Tag, T);
   }
 
-  Function* getGenFuncForTarget(visc::Target T)  const {
+  Function *getGenFuncForTarget(visc::Target T) const {
     switch (T) {
-      case visc::None:
-	return NULL;
-      case visc::CPU_TARGET:
-        return GenFuncs.CPUGenFunc;
-      case visc::GPU_TARGET:
-        return GenFuncs.GPUGenFunc;
-      case visc::CPU_OR_GPU_TARGET:
-        assert(false &&
-               "Requesting genarated node function with dual tag instead of \
+    case visc::None:
+      return NULL;
+    case visc::CPU_TARGET:
+      return GenFuncs.CPUGenFunc;
+    case visc::GPU_TARGET:
+      return GenFuncs.GPUGenFunc;
+    case visc::CPU_OR_GPU_TARGET:
+      assert(false &&
+             "Requesting genarated node function with dual tag instead of \
                 CPU/GPU/SPIR/CUDNN/PROMISE\n");
-      default:
-        assert(false && "Unknown target for generated function\n");
+    default:
+      assert(false && "Unknown target for generated function\n");
     }
     return NULL;
   }
 
   void removeGenFuncForTarget(visc::Target T) {
     switch (T) {
-      case visc::None:
-	return;
-      case visc::CPU_TARGET:
-        GenFuncs.CPUGenFunc = NULL;
-        GenFuncInfo.cpu_hasX86Func = false;
-        break;
-      case visc::GPU_TARGET:
-        GenFuncs.GPUGenFunc = NULL;
-        GenFuncInfo.gpu_hasX86Func = false;
-        break;
-      case visc::CPU_OR_GPU_TARGET:
-        assert(false &&
-               "Removing genarated node function with dual tag instead of \
+    case visc::None:
+      return;
+    case visc::CPU_TARGET:
+      GenFuncs.CPUGenFunc = NULL;
+      GenFuncInfo.cpu_hasX86Func = false;
+      break;
+    case visc::GPU_TARGET:
+      GenFuncs.GPUGenFunc = NULL;
+      GenFuncInfo.gpu_hasX86Func = false;
+      break;
+    case visc::CPU_OR_GPU_TARGET:
+      assert(false &&
+             "Removing genarated node function with dual tag instead of \
                 CPU/GPU/SPIR/CUDNN/PROMISE\n");
-      default:
-        assert(false && "Unknown target for generated function\n");
+    default:
+      assert(false && "Unknown target for generated function\n");
     }
     return;
   }
 
-  void setTargetHint(visc::Target T) {
-    Hint = T;
-  }
+  void setTargetHint(visc::Target T) { Hint = T; }
 
-  visc::Target getTargetHint() const {
-    return Hint;
-  }
+  visc::Target getTargetHint() const { return Hint; }
 
-  bool isDummyNode() const {
-    return isEntryNode() || isExitNode();
-  }
+  bool isDummyNode() const { return isEntryNode() || isExitNode(); }
 
   bool isAllocationNode() {
     // If Allocation Property is defined then it is not an allocation node
@@ -525,18 +466,18 @@ public:
   void setRank(unsigned r);
   bool isEntryNode() const;
   bool isExitNode() const;
-  DFEdge* getInDFEdgeAt(unsigned inPort);
-  DFEdge* getExtendedInDFEdgeAt(unsigned inPort);
-  DFEdge* getOutDFEdgeAt(unsigned outPort);
-  DFEdge* getExtendedOutDFEdgeAt(unsigned outPort);
+  DFEdge *getInDFEdgeAt(unsigned inPort);
+  DFEdge *getExtendedInDFEdgeAt(unsigned inPort);
+  DFEdge *getOutDFEdgeAt(unsigned outPort);
+  DFEdge *getExtendedOutDFEdgeAt(unsigned outPort);
   std::map<unsigned, unsigned> getInArgMap();
-  std::map<unsigned, std::pair<Value*, unsigned> > getSharedInArgMap();
+  std::map<unsigned, std::pair<Value *, unsigned>> getSharedInArgMap();
   std::vector<unsigned> getOutArgMap();
-  int getAncestorHops(DFNode* N);
+  int getAncestorHops(DFNode *N);
   bool hasSideEffects();
 
   virtual void applyDFNodeVisitor(DFNodeVisitor &V) = 0;
-//  virtual void applyDFEdgeVisitor(DFEdgeVisitor &V) = 0;
+  //  virtual void applyDFEdgeVisitor(DFEdgeVisitor &V) = 0;
 
   void clearGraphElements() {
     Successors.clear();
@@ -544,7 +485,6 @@ public:
     OutDFEdges.clear();
     Parent = NULL;
   }
-
 };
 
 /*****************************************************
@@ -553,49 +493,43 @@ public:
 class DFInternalNode : public DFNode {
 
 private:
-  DFGraph* childGraph;            ///< Pointer to dataflow graph
+  DFGraph *childGraph; ///< Pointer to dataflow graph
 
   // Constructor
-  DFInternalNode(IntrinsicInst* II, Function* FuncPointer, visc::Target Hint,
-      DFInternalNode* Parent, int NumOfDim, std::vector<Value*> DimLimits) :
-    DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, InternalNode) {
+  DFInternalNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint,
+                 DFInternalNode *Parent, int NumOfDim,
+                 std::vector<Value *> DimLimits)
+      : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits,
+               InternalNode) {
 
     childGraph = new DFGraph(this);
   }
-public:
 
-  static DFInternalNode *Create(IntrinsicInst* II, Function* FuncPointer,
-      visc::Target Hint = visc::CPU_TARGET, DFInternalNode* Parent = NULL, int
-      NumOfDim = 0, std::vector<Value*> DimLimits = std::vector<Value*>()) {
+public:
+  static DFInternalNode *
+  Create(IntrinsicInst *II, Function *FuncPointer,
+         visc::Target Hint = visc::CPU_TARGET, DFInternalNode *Parent = NULL,
+         int NumOfDim = 0,
+         std::vector<Value *> DimLimits = std::vector<Value *>()) {
 
     return new DFInternalNode(II, FuncPointer, Hint, Parent, NumOfDim,
-        DimLimits);
+                              DimLimits);
   }
 
-  static bool classof(const DFNode *N) {
-    return N->getKind() == InternalNode;
-  }
+  static bool classof(const DFNode *N) { return N->getKind() == InternalNode; }
 
-  void addChildToDFGraph(DFNode* N) {
-    childGraph->addChildDFNode(N);
-  }
+  void addChildToDFGraph(DFNode *N) { childGraph->addChildDFNode(N); }
 
-  void removeChildFromDFGraph(DFNode* N) {
-    childGraph->removeChildDFNode(N);
-  }
+  void removeChildFromDFGraph(DFNode *N) { childGraph->removeChildDFNode(N); }
 
-  void addEdgeToDFGraph(DFEdge* E);
- 
-  DFGraph* getChildGraph() const {
-    return childGraph;
-  }
+  void addEdgeToDFGraph(DFEdge *E);
 
-  bool isChildGraphStreaming() {
-    return childGraph->isStreaming();
-  }
+  DFGraph *getChildGraph() const { return childGraph; }
+
+  bool isChildGraphStreaming() { return childGraph->isStreaming(); }
 
   void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/
-//  void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
+  //  void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
 };
 
 /*****************************************************
@@ -605,26 +539,23 @@ class DFLeafNode : public DFNode {
 
 private:
   // Constructor
-  DFLeafNode(IntrinsicInst* II, Function* FuncPointer, visc::Target Hint,
-      DFInternalNode* Parent, int NumOfDim = 0, std::vector<Value*> DimLimits =
-      std::vector<Value*>()) : DFNode(II, FuncPointer, Hint, Parent, NumOfDim,
-      DimLimits, LeafNode) {}
+  DFLeafNode(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint,
+             DFInternalNode *Parent, int NumOfDim = 0,
+             std::vector<Value *> DimLimits = std::vector<Value *>())
+      : DFNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits, LeafNode) {}
 
 public:
-
-  static DFLeafNode *Create(IntrinsicInst* II, Function* FuncPointer, visc::Target Hint,
-                            DFInternalNode* Parent, int NumOfDim = 0,
-                            std::vector<Value*> DimLimits = std::vector<Value*>()) {
+  static DFLeafNode *
+  Create(IntrinsicInst *II, Function *FuncPointer, visc::Target Hint,
+         DFInternalNode *Parent, int NumOfDim = 0,
+         std::vector<Value *> DimLimits = std::vector<Value *>()) {
     return new DFLeafNode(II, FuncPointer, Hint, Parent, NumOfDim, DimLimits);
   }
 
-  static bool classof(const DFNode *N) {
-    return N->getKind() == LeafNode;
-  }
+  static bool classof(const DFNode *N) { return N->getKind() == LeafNode; }
 
   void applyDFNodeVisitor(DFNodeVisitor &V); /*virtual*/
-//  void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
-
+  //  void applyDFEdgeVisitor(DFEdgeVisitor &V); /*virtual*/
 };
 
 // DFEdge represents a single VISC Dataflow Edge in LLVM.
@@ -646,135 +577,110 @@ public:
 class DFEdge {
 private:
   // Important things that make up a Dataflow Edge
-  DFNode* SrcDF;                ///< Pointer to source dataflow Node
-  DFNode* DestDF;               ///< Pointer to destination dataflow Node
-  bool EdgeType;                ///< ONE_TO_ONE or ALL_TO_ALL
-  unsigned SourcePosition;      ///< Position of data in the output of source
-                                ///< DFnode
-  unsigned DestPosition;        ///< Position of data in the input of
-                                ///< destination DFnode
-  Type* ArgType;                ///< Type of the argument
-  bool isStreaming;             ///< Is this an streaming edge
+  DFNode *SrcDF;           ///< Pointer to source dataflow Node
+  DFNode *DestDF;          ///< Pointer to destination dataflow Node
+  bool EdgeType;           ///< ONE_TO_ONE or ALL_TO_ALL
+  unsigned SourcePosition; ///< Position of data in the output of source
+                           ///< DFnode
+  unsigned DestPosition;   ///< Position of data in the input of
+                           ///< destination DFnode
+  Type *ArgType;           ///< Type of the argument
+  bool isStreaming;        ///< Is this a streaming edge
 
   // Functions
-  DFEdge(DFNode* _SrcDF, DFNode* _DestDF, bool _EdgeType,
-         unsigned _SourcePosition, unsigned _DestPosition, Type* _ArgType, bool _isStreaming)
-       : SrcDF(_SrcDF), DestDF(_DestDF), EdgeType(_EdgeType),
-         SourcePosition(_SourcePosition), DestPosition(_DestPosition),
-         ArgType(_ArgType), isStreaming(_isStreaming) {}
+  DFEdge(DFNode *_SrcDF, DFNode *_DestDF, bool _EdgeType,
+         unsigned _SourcePosition, unsigned _DestPosition, Type *_ArgType,
+         bool _isStreaming)
+      : SrcDF(_SrcDF), DestDF(_DestDF), EdgeType(_EdgeType),
+        SourcePosition(_SourcePosition), DestPosition(_DestPosition),
+        ArgType(_ArgType), isStreaming(_isStreaming) {}
 
 public:
-  //TODO: Decide whether we need this type
-//  typedef enum {ONE_TO_ONE = false, ALL_TO_ALL} DFEdgeType;
+  // TODO: Decide whether we need this type
+  //  typedef enum {ONE_TO_ONE = false, ALL_TO_ALL} DFEdgeType;
 
-  static DFEdge *Create(DFNode* SrcDF, DFNode* DestDF, bool EdgeType,
-                        unsigned SourcePosition, unsigned DestPosition, Type*
-                        ArgType, bool isStreaming = false) {
+  static DFEdge *Create(DFNode *SrcDF, DFNode *DestDF, bool EdgeType,
+                        unsigned SourcePosition, unsigned DestPosition,
+                        Type *ArgType, bool isStreaming = false) {
     return new DFEdge(SrcDF, DestDF, EdgeType, SourcePosition, DestPosition,
                       ArgType, isStreaming);
-
   }
 
-  DFNode* getSourceDF() const {
-    return SrcDF;
-  }
+  DFNode *getSourceDF() const { return SrcDF; }
 
-  void setSourceDF(DFNode* N) {
-    SrcDF = N;
-  }
+  void setSourceDF(DFNode *N) { SrcDF = N; }
 
-  DFNode* getDestDF() const {
-    return DestDF;
-  }
+  DFNode *getDestDF() const { return DestDF; }
 
-  void setDestDF(DFNode* N) {
-    DestDF = N;
-  }
+  void setDestDF(DFNode *N) { DestDF = N; }
 
-  bool getEdgeType() const {
-    return EdgeType;
-  }
+  bool getEdgeType() const { return EdgeType; }
 
-  unsigned getSourcePosition() const {
-    return SourcePosition;
-  }
+  unsigned getSourcePosition() const { return SourcePosition; }
 
-  void setSourcePosition(unsigned i) {
-    SourcePosition = i;
-  }
+  void setSourcePosition(unsigned i) { SourcePosition = i; }
 
-  unsigned getDestPosition() const {
-    return DestPosition;
-  }
+  unsigned getDestPosition() const { return DestPosition; }
 
-  void setDestPosition(unsigned i) {
-    DestPosition = i;
-  }
+  void setDestPosition(unsigned i) { DestPosition = i; }
 
-  Type* getType() const {
-    return ArgType;
-  }
-
-  bool isStreamingEdge() const {
-    return isStreaming;
-  }
+  Type *getType() const { return ArgType; }
 
+  bool isStreamingEdge() const { return isStreaming; }
 };
 
-
 //===--------------------- DFGraph Outlined Functions --------------===//
-DFGraph::DFGraph(DFInternalNode* P) {
+DFGraph::DFGraph(DFInternalNode *P) {
   Parent = P;
   // Create dummy entry and exit nodes and add them to the graph
-  Entry = DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent);
+  Entry =
+      DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent);
   Exit = DFLeafNode::Create(NULL, Parent->getFuncPointer(), visc::None, Parent);
   addChildDFNode(Entry);
   addChildDFNode(Exit);
 }
 
-void DFGraph::sortChildren() {
-  std::sort(begin(), end(), compareRank);
-}
+void DFGraph::sortChildren() { std::sort(begin(), end(), compareRank); }
 
-bool DFGraph::compareRank(DFNode* A, DFNode* B) {
+bool DFGraph::compareRank(DFNode *A, DFNode *B) {
   return A->getRank() < B->getRank();
 }
 
 bool DFGraph::isStreaming() {
-  for (auto E: DFEdgeList) {
-    if(E->isStreamingEdge())
+  for (auto E : DFEdgeList) {
+    if (E->isStreamingEdge())
       return true;
   }
   return false;
 }
 
 //===--------------------- DFNode Outlined Functions --------------===//
-DFNode::DFNode(IntrinsicInst* _II, Function* _FuncPointer, visc::Target _Hint,
-    DFInternalNode* _Parent, unsigned _NumOfDim, std::vector<Value*> _DimLimits,
-    DFNodeKind _K): II(_II), FuncPointer(_FuncPointer), Parent(_Parent),
-  NumOfDim(_NumOfDim), DimLimits(_DimLimits), Kind(_K) {
+DFNode::DFNode(IntrinsicInst *_II, Function *_FuncPointer, visc::Target _Hint,
+               DFInternalNode *_Parent, unsigned _NumOfDim,
+               std::vector<Value *> _DimLimits, DFNodeKind _K)
+    : II(_II), FuncPointer(_FuncPointer), Parent(_Parent), NumOfDim(_NumOfDim),
+      DimLimits(_DimLimits), Kind(_K) {
 
-  Type* Ty = FuncPointer->getFunctionType()->getReturnType();
+  Type *Ty = FuncPointer->getFunctionType()->getReturnType();
 
   // Allow the return type to be void too, in the hVISC IR. If return type is
   // void, create an empty struct type and keep that as the return type of the
   // node.
-  if(Ty->isVoidTy())
+  if (Ty->isVoidTy())
     Ty = StructType::get(Ty->getContext(), true);
 
   // All nodes output type must always be a struct type.
-  assert(isa<StructType>(Ty)
-    && "Invalid return type of a dataflow node");
+  assert(isa<StructType>(Ty) && "Invalid return type of a dataflow node");
 
   // Check that the number of dimensions is correct
   assert(NumOfDim <= 3 && "Invalid num of dimensions for dataflow node!");
 
   // Check that the number of dimensions is correct
-  assert(DimLimits.size() == NumOfDim
-    && "Incompatible num of dimensions and dimension limits for DFNode!");
+  assert(DimLimits.size() == NumOfDim &&
+         "Incompatible num of dimensions and dimension limits for DFNode!");
 
   OutputType = cast<StructType>(Ty);
-  Level = (_Parent) ? _Parent->getLevel() + 1 : 0 ;
+  Level = (_Parent) ? _Parent->getLevel() + 1 : 0;
   Rank = 0;
 
   Tag = visc::None;
@@ -794,12 +700,12 @@ DFNode::DFNode(IntrinsicInst* _II, Function* _FuncPointer, visc::Target _Hint,
 void DFNode::setRank(unsigned r) {
   Rank = r;
   // Update rank of successors
-  for(outdfedge_iterator i = outdfedge_begin(),
-      e = outdfedge_end(); i != e; ++i) {
-    DFEdge* E = *i;
-    DFNode* D = E->getDestDF();
-    if(D->getRank() <= r)
-      D->setRank(r+1);
+  for (outdfedge_iterator i = outdfedge_begin(), e = outdfedge_end(); i != e;
+       ++i) {
+    DFEdge *E = *i;
+    DFNode *D = E->getDestDF();
+    if (D->getRank() <= r)
+      D->setRank(r + 1);
   }
 }
 
@@ -815,33 +721,35 @@ bool DFNode::isExitNode() const {
   return Parent->getChildGraph()->isExit(this);
 }
 
-DFEdge* DFNode::getInDFEdgeAt(unsigned inPort) {
+DFEdge *DFNode::getInDFEdgeAt(unsigned inPort) {
 
   // If it is not a dummy node, then check if inPort should be less than the
   // number of arguments in the associated function.
-  assert((inPort < FuncPointer->getFunctionType()->getNumParams()
-    || isDummyNode()) && "Invalid input port request!");
-
-  for(indfedge_iterator i = indfedge_begin(), e = indfedge_end(); i != e; ++i) {
-    DFEdge* E = *i;
-    if(inPort == E->getDestPosition())
+  assert((inPort < FuncPointer->getFunctionType()->getNumParams() ||
+          isDummyNode()) &&
+         "Invalid input port request!");
+
+  for (indfedge_iterator i = indfedge_begin(), e = indfedge_end(); i != e;
+       ++i) {
+    DFEdge *E = *i;
+    if (inPort == E->getDestPosition())
       return E;
   }
   return NULL;
 }
 
-DFEdge* DFNode::getExtendedInDFEdgeAt(unsigned inPort) {
-  DFEdge* Ein = getInDFEdgeAt(inPort);
-  DFNode* sn = Ein->getSourceDF();
+DFEdge *DFNode::getExtendedInDFEdgeAt(unsigned inPort) {
+  DFEdge *Ein = getInDFEdgeAt(inPort);
+  DFNode *sn = Ein->getSourceDF();
   if (!sn->isEntryNode())
     return Ein;
 
-  DFNode* pn = getParent();
+  DFNode *pn = getParent();
   if (pn->isRoot())
     return Ein;
 
-  DFEdge* PEin = pn->getInDFEdgeAt(inPort);
-  DFInternalNode* SPN = dyn_cast<DFInternalNode>(PEin->getSourceDF());
+  DFEdge *PEin = pn->getInDFEdgeAt(inPort);
+  DFInternalNode *SPN = dyn_cast<DFInternalNode>(PEin->getSourceDF());
   if (!SPN)
     return PEin;
 
@@ -849,30 +757,31 @@ DFEdge* DFNode::getExtendedInDFEdgeAt(unsigned inPort) {
   return SPN->getChildGraph()->getExit()->getInDFEdgeAt(outPort);
 }
 
-DFEdge* DFNode::getOutDFEdgeAt(unsigned outPort) {
+DFEdge *DFNode::getOutDFEdgeAt(unsigned outPort) {
 
   // Cannot perform check for the number of outputs here,
   // it depends on the node's return type
 
-  for(outdfedge_iterator i = outdfedge_begin(), e = outdfedge_end(); i != e; ++i) {
-    DFEdge* E = *i;
-    if(outPort == E->getSourcePosition())
+  for (outdfedge_iterator i = outdfedge_begin(), e = outdfedge_end(); i != e;
+       ++i) {
+    DFEdge *E = *i;
+    if (outPort == E->getSourcePosition())
       return E;
   }
   return NULL;
 }
 
-DFEdge* DFNode::getExtendedOutDFEdgeAt(unsigned outPort) {
-  DFEdge* Eout = getOutDFEdgeAt(outPort);
+DFEdge *DFNode::getExtendedOutDFEdgeAt(unsigned outPort) {
+  DFEdge *Eout = getOutDFEdgeAt(outPort);
   if (!Eout->getDestDF()->isExitNode())
     return Eout;
 
-  DFNode* pn = getParent();
+  DFNode *pn = getParent();
   if (pn->isRoot())
     return Eout;
 
-  DFEdge* PEout = pn->getOutDFEdgeAt(outPort);
-  DFInternalNode* DPN = dyn_cast<DFInternalNode>(PEout->getDestDF());
+  DFEdge *PEout = pn->getOutDFEdgeAt(outPort);
+  DFInternalNode *DPN = dyn_cast<DFInternalNode>(PEout->getDestDF());
   if (!DPN)
     return PEout;
 
@@ -884,7 +793,7 @@ DFEdge* DFNode::getExtendedOutDFEdgeAt(unsigned outPort) {
 std::map<unsigned, unsigned> DFNode::getInArgMap() {
   std::map<unsigned, unsigned> map;
   for (unsigned i = 0; i < InDFEdges.size(); i++) {
-    DFEdge* E = getInDFEdgeAt(i);
+    DFEdge *E = getInDFEdgeAt(i);
     if (E->getSourceDF()->isAllocationNode())
       continue;
     unsigned pos = E->getSourcePosition();
@@ -894,13 +803,13 @@ std::map<unsigned, unsigned> DFNode::getInArgMap() {
 }
 
 // Only Allocation Nodes - only detect relevant indices
-std::map<unsigned, std::pair<Value*, unsigned> > DFNode::getSharedInArgMap() {
-  std::map<unsigned, std::pair<Value*, unsigned> > map;
+std::map<unsigned, std::pair<Value *, unsigned>> DFNode::getSharedInArgMap() {
+  std::map<unsigned, std::pair<Value *, unsigned>> map;
   for (unsigned i = 0; i < InDFEdges.size(); i++) {
-    DFEdge* E = getInDFEdgeAt(i);
+    DFEdge *E = getInDFEdgeAt(i);
     if (!E->getSourceDF()->isAllocationNode())
       continue;
-    map[i] = std::pair<Value *, unsigned>(NULL,0);
+    map[i] = std::pair<Value *, unsigned>(NULL, 0);
   }
   return map;
 }
@@ -908,18 +817,18 @@ std::map<unsigned, std::pair<Value*, unsigned> > DFNode::getSharedInArgMap() {
 std::vector<unsigned> DFNode::getOutArgMap() {
   std::vector<unsigned> map(OutDFEdges.size());
   for (unsigned i = 0; i < OutDFEdges.size(); i++) {
-    DFEdge* E = getOutDFEdgeAt(i);
+    DFEdge *E = getOutDFEdgeAt(i);
     unsigned pos = E->getDestPosition();
     map[pos] = i;
   }
   return map;
 }
 
-int DFNode::getAncestorHops(DFNode* N) {
-  DFNode* temp = this;
+int DFNode::getAncestorHops(DFNode *N) {
+  DFNode *temp = this;
   int hops = 0;
   while (temp != NULL) {
-    if(temp == N)
+    if (temp == N)
       return hops;
     temp = temp->getParent();
     hops++;
@@ -938,22 +847,24 @@ int DFNode::getAncestorHops(DFNode* N) {
 bool DFNode::hasSideEffects() {
   bool hasSideEffects = false;
   // Check #1: No incoming pointer argument
-  for(DFEdge* E: this->InDFEdges) {
+  for (DFEdge *E : this->InDFEdges) {
     hasSideEffects |= E->getType()->isPointerTy();
   }
   return hasSideEffects;
 }
 
 //===--------------------- DFInternalNode Outlined Functions --------------===//
-void DFInternalNode::addEdgeToDFGraph(DFEdge* E) {
-  DFNode* S = E->getSourceDF();
-  DFNode* D = E->getDestDF();
+void DFInternalNode::addEdgeToDFGraph(DFEdge *E) {
+  DFNode *S = E->getSourceDF();
+  DFNode *D = E->getDestDF();
 
-  assert(std::find(childGraph->begin(), childGraph->end(), S)!=childGraph->end()
-    && "Source node not found in child dataflow graph!");
+  assert(std::find(childGraph->begin(), childGraph->end(), S) !=
+             childGraph->end() &&
+         "Source node not found in child dataflow graph!");
 
-  assert(std::find(childGraph->begin(), childGraph->end(), D)!=childGraph->end()
-    && "Destination node not found in child dataflow graph!");
+  assert(std::find(childGraph->begin(), childGraph->end(), D) !=
+             childGraph->end() &&
+         "Destination node not found in child dataflow graph!");
 
   // Update Graph
   childGraph->addDFEdge(E);
@@ -964,33 +875,29 @@ void DFInternalNode::addEdgeToDFGraph(DFEdge* E) {
   D->addInDFEdge(E);
 
   // Update Rank
-  if(D->getRank() <= S->getRank())
-    D->setRank(S->getRank()+1);
+  if (D->getRank() <= S->getRank())
+    D->setRank(S->getRank() + 1);
 }
 
 //===------------------------ Property Objects ---------------------------====//
 class AllocationNodeProperty {
-  public:
-  typedef std::pair<DFEdge*, Value*> AllocationType;
+public:
+  typedef std::pair<DFEdge *, Value *> AllocationType;
   typedef std::vector<AllocationType> AllocationListType;
 
-  private:
-    AllocationListType AllocationList;
+private:
+  AllocationListType AllocationList;
 
-  public:
-    AllocationNodeProperty() {}
+public:
+  AllocationNodeProperty() {}
 
-    unsigned getNumAllocations() {
-      return AllocationList.size();
-    }
+  unsigned getNumAllocations() { return AllocationList.size(); }
 
-    AllocationListType getAllocationList() {
-      return AllocationList;
-    }
+  AllocationListType getAllocationList() { return AllocationList; }
 
-    void insertAllocation(DFEdge* E, Value* V) {
-      AllocationList.push_back(AllocationType(E,V));
-    }
+  void insertAllocation(DFEdge *E, Value *V) {
+    AllocationList.push_back(AllocationType(E, V));
+  }
 };
 
 //===-------------------------- Visitor Classes ---------------------------===//
@@ -998,42 +905,40 @@ class AllocationNodeProperty {
 class DFNodeVisitor {
 public:
   virtual ~DFNodeVisitor() {}
-  virtual void visit(DFInternalNode* N) = 0;
-  virtual void visit(DFLeafNode* N) = 0;
+  virtual void visit(DFInternalNode *N) = 0;
+  virtual void visit(DFLeafNode *N) = 0;
 };
 
-void DFInternalNode::applyDFNodeVisitor(DFNodeVisitor &V) {
-  V.visit(this);
-}
+void DFInternalNode::applyDFNodeVisitor(DFNodeVisitor &V) { V.visit(this); }
 
-void DFLeafNode::applyDFNodeVisitor(DFNodeVisitor &V) {
-  V.visit(this);
-}
+void DFLeafNode::applyDFNodeVisitor(DFNodeVisitor &V) { V.visit(this); }
 
 class DFTreeTraversal : public DFNodeVisitor {
 
 public:
   virtual ~DFTreeTraversal() {}
 
-  virtual void visit(DFInternalNode* N){
-    DEBUG(errs() << "Visited Node (I) - " << N->getFuncPointer()->getName() << "\n");
-    for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
-        e = N->getChildGraph()->end(); i != e; ++i) {
-      DFNode* child = *i;
+  virtual void visit(DFInternalNode *N) {
+    DEBUG(errs() << "Visited Node (I) - " << N->getFuncPointer()->getName()
+                 << "\n");
+    for (DFGraph::children_iterator i = N->getChildGraph()->begin(),
+                                    e = N->getChildGraph()->end();
+         i != e; ++i) {
+      DFNode *child = *i;
       child->applyDFNodeVisitor(*this);
     }
   }
 
-  virtual void visit(DFLeafNode* N) {
-    DEBUG(errs() << "Visited Node (L) - " << N->getFuncPointer()->getName() << "\n");
+  virtual void visit(DFLeafNode *N) {
+    DEBUG(errs() << "Visited Node (L) - " << N->getFuncPointer()->getName()
+                 << "\n");
   }
-
 };
 
 class FollowSuccessors : public DFNodeVisitor {
 
 public:
-  virtual void visit(DFInternalNode* N) {
+  virtual void visit(DFInternalNode *N) {
     /*DFNodeListType L; // Empty List that will contain the sorted elements
     DFNodeListType S; // Set of all nodes with no incoming edges
 
@@ -1047,9 +952,11 @@ public:
         if
       }
     }*/
-    DEBUG(errs() << "Visited Node (I) - " << N->getFuncPointer()->getName() << "\n");
-    for(DFInternalNode::successor_iterator i = N->successors_begin(),
-        e = N->successors_end(); i != e; ++i) {
+    DEBUG(errs() << "Visited Node (I) - " << N->getFuncPointer()->getName()
+                 << "\n");
+    for (DFInternalNode::successor_iterator i = N->successors_begin(),
+                                            e = N->successors_end();
+         i != e; ++i) {
       /* Traverse the graph.
        * Choose the kind of traversal we want
        * Do we do a DAG kind of traversal?
@@ -1057,67 +964,68 @@ public:
     }
   }
 
-  virtual void visit(DFLeafNode* N) {
-    DEBUG(errs() << "Visited Node (L) - " << N->getFuncPointer()->getName() << "\n");
+  virtual void visit(DFLeafNode *N) {
+    DEBUG(errs() << "Visited Node (L) - " << N->getFuncPointer()->getName()
+                 << "\n");
   }
 };
 
 class ReplaceNodeFunction : public DFNodeVisitor {
 
 protected:
-  //Member variables
+  // Member variables
   Module &M;
-  Function* F = NULL; // Function to replace
-  Function* G = NULL; // Function to be replaced by
+  Function *F = NULL; // Function to be replaced
+  Function *G = NULL; // Replacement function installed in place of F
 
   // Functions
-  void replaceNodeFunction(DFInternalNode* N) {
+  void replaceNodeFunction(DFInternalNode *N) {
     if (N->getFuncPointer() == F)
       N->setFuncPointer(G);
   }
 
-  void replaceNodeFunction(DFLeafNode* N) {
+  void replaceNodeFunction(DFLeafNode *N) {
     if (N->getFuncPointer() == F)
       N->setFuncPointer(G);
   }
 
-  ~ReplaceNodeFunction() {};
+  ~ReplaceNodeFunction(){};
 
 public:
-
   // Constructor
-  ReplaceNodeFunction(Module &_M,
-    Function* _F, Function* _G) : M(_M), F(_F), G(_G) {}
+  ReplaceNodeFunction(Module &_M, Function *_F, Function *_G)
+      : M(_M), F(_F), G(_G) {}
 
   ReplaceNodeFunction(Module &_M) : M(_M), F(NULL), G(NULL) {}
 
-  void setF(Function* _F) {
-    F = _F;
-  }
+  void setF(Function *_F) { F = _F; }
 
-  void setG(Function* _G) {
-    G = _G;
-  }
+  void setG(Function *_G) { G = _G; }
 
-  virtual void visit(DFInternalNode* N) {
-    DEBUG(errs() << "Start: Replace Node Function for Node (I) - " << N->getFuncPointer()->getName() << "\n");
+  virtual void visit(DFInternalNode *N) {
+    DEBUG(errs() << "Start: Replace Node Function for Node (I) - "
+                 << N->getFuncPointer()->getName() << "\n");
 
     // Follows a bottom-up approach.
-    for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
-        e = N->getChildGraph()->end(); i != e; ++i) {
-      DFNode* child = *i;
+    for (DFGraph::children_iterator i = N->getChildGraph()->begin(),
+                                    e = N->getChildGraph()->end();
+         i != e; ++i) {
+      DFNode *child = *i;
       child->applyDFNodeVisitor(*this);
     }
     // Generate code for this internal node now. This way all the cloned
     // functions for children exist.
     replaceNodeFunction(N);
-    DEBUG(errs() << "DONE: Replace Node Function for Node (I) - " << N->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "DONE: Replace Node Function for Node (I) - "
+                 << N->getFuncPointer()->getName() << "\n");
   }
 
-  virtual void visit(DFLeafNode* N) {
-    DEBUG(errs() << "Start: Replace Node Function for Node (L) - " << N->getFuncPointer()->getName() << "\n");
+  virtual void visit(DFLeafNode *N) {
+    DEBUG(errs() << "Start: Replace Node Function for Node (L) - "
+                 << N->getFuncPointer()->getName() << "\n");
     replaceNodeFunction(N);
-    DEBUG(errs() << "DONE: Replace Node Function for Node (L) - " << N->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "DONE: Replace Node Function for Node (L) - "
+                 << N->getFuncPointer()->getName() << "\n");
   }
 };
 
@@ -1133,7 +1041,7 @@ public:
 // GraphTraits specializations for DFNode graph (DFG)
 //===--------------------------------------------------------------------===//
 
-template <> struct GraphTraits<DFNode*> {
+template <> struct GraphTraits<DFNode *> {
   typedef DFNode *NodeRef;
   typedef typename DFNode::successor_iterator ChildIteratorType;
 
@@ -1143,110 +1051,100 @@ template <> struct GraphTraits<DFNode*> {
   static inline ChildIteratorType child_end(NodeRef N) {
     return N->successors_end();
   }
-
 };
 
-template <> struct GraphTraits<DFGraph*> : public GraphTraits<DFNode*> {
-  typedef typename DFGraph::children_iterator nodes_iterator; 
+template <> struct GraphTraits<DFGraph *> : public GraphTraits<DFNode *> {
+  typedef typename DFGraph::children_iterator nodes_iterator;
 
-  static NodeRef getEntryNode(DFGraph* G) {
-    return G->front();
-  }
+  static NodeRef getEntryNode(DFGraph *G) { return G->front(); }
 
-  static nodes_iterator nodes_begin(DFGraph *G) {
-    return G->begin();
-  }
+  static nodes_iterator nodes_begin(DFGraph *G) { return G->begin(); }
 
-  static inline nodes_iterator nodes_end(DFGraph *G) {
-    return G->end();
-  }
+  static inline nodes_iterator nodes_end(DFGraph *G) { return G->end(); }
 };
 
-template<>
-struct DOTGraphTraits<DFGraph*> : public DefaultDOTGraphTraits {
+template <> struct DOTGraphTraits<DFGraph *> : public DefaultDOTGraphTraits {
 
-  DOTGraphTraits (bool isSimple=false)
-    : DefaultDOTGraphTraits(isSimple) {}
+  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
 
-  static std::string getGraphName(DFGraph* G) {
-    DFInternalNode* Parent = G->getParent();
-    if(Parent != NULL)
+  static std::string getGraphName(DFGraph *G) {
+    DFInternalNode *Parent = G->getParent();
+    if (Parent != NULL)
       return Parent->getFuncPointer()->getName();
     else
       return "Dataflow Graph";
   }
 
-  static std::string getGraphProperties(DFGraph* G) {
+  static std::string getGraphProperties(DFGraph *G) {
     return "\tcompound=true;";
   }
 
-  std::string getNodeLabel (DFNode* N, DFGraph* G) {
-    if(N->isEntryNode())
+  std::string getNodeLabel(DFNode *N, DFGraph *G) {
+    if (N->isEntryNode())
       return "Entry";
-    if(N->isExitNode())
+    if (N->isExitNode())
       return "Exit";
     return N->getFuncPointer()->getName();
   }
 
-  static bool isCompoundNode(DFNode* N) {
+  static bool isCompoundNode(DFNode *N) {
     bool ret = isa<DFInternalNode>(N);
     return ret;
   }
-  
-  static DFGraph* getSubGraph(DFNode* N, DFGraph* G) {
-    DFInternalNode* IN = dyn_cast<DFInternalNode>(N);
+
+  static DFGraph *getSubGraph(DFNode *N, DFGraph *G) {
+    DFInternalNode *IN = dyn_cast<DFInternalNode>(N);
     assert(IN && "No subgraph for leaf dataflow node!");
     return IN->getChildGraph();
   }
 
-  static DFNode* getAnySimpleNodeForSrc(DFNode* N) {
-    DFInternalNode* IN = dyn_cast<DFInternalNode>(N);
+  static DFNode *getAnySimpleNodeForSrc(DFNode *N) {
+    DFInternalNode *IN = dyn_cast<DFInternalNode>(N);
     assert(IN && "No subgraph for leaf dataflow node!");
     return IN->getChildGraph()->getExit();
   }
 
-  static DFNode* getAnySimpleNodeForDest(DFNode* N) {
-    DFInternalNode* IN = dyn_cast<DFInternalNode>(N);
+  static DFNode *getAnySimpleNodeForDest(DFNode *N) {
+    DFInternalNode *IN = dyn_cast<DFInternalNode>(N);
     assert(IN && "No subgraph for leaf dataflow node!");
     return IN->getChildGraph()->getEntry();
   }
 
-  static std::string getNodeAttributes(DFNode* N, DFGraph* G) {
+  static std::string getNodeAttributes(DFNode *N, DFGraph *G) {
     std::string Attr = "";
     raw_string_ostream OS(Attr);
     OS << "shape=oval";
     return OS.str();
   }
 
-  static std::string getEdgeAttributes(DFNode* N, DFNode::successor_iterator SI, DFGraph* G) {
+  static std::string getEdgeAttributes(DFNode *N, DFNode::successor_iterator SI,
+                                       DFGraph *G) {
     std::string Attr = "";
     raw_string_ostream OS(Attr);
     bool comma = false;
-    if(DFInternalNode* SrcNode = dyn_cast<DFInternalNode>(N)) {
+    if (DFInternalNode *SrcNode = dyn_cast<DFInternalNode>(N)) {
       comma = true;
-      OS << "ltail=cluster"; 
-      OS << static_cast<const void*>(SrcNode);
+      OS << "ltail=cluster";
+      OS << static_cast<const void *>(SrcNode);
     }
-    DFNode* DN = *SI;
-    if(DFInternalNode* DestNode = dyn_cast<DFInternalNode>(DN)) {
-      if(comma)
+    DFNode *DN = *SI;
+    if (DFInternalNode *DestNode = dyn_cast<DFInternalNode>(DN)) {
+      if (comma)
         OS << ", ";
-      OS << "lhead=cluster"; 
-      OS << static_cast<const void*>(DestNode);
+      OS << "lhead=cluster";
+      OS << static_cast<const void *>(DestNode);
     }
     return OS.str();
   }
 
-  static void addCustomGraphFeatures(DFGraph* G, GraphWriter<DFGraph*> &GW) {
-
-  }
+  static void addCustomGraphFeatures(DFGraph *G, GraphWriter<DFGraph *> &GW) {}
 };
 
 void viewDFGraph(DFGraph *G) {
   llvm::WriteGraph(G, "DataflowGraph");
-  //llvm::ViewGraph(G, "DataflowGraph");
+  // llvm::ViewGraph(G, "DataflowGraph");
 }
 
-} // End llvm namespace
+} // namespace llvm
 
 #endif
diff --git a/hpvm/include/SupportVISC/VISCHint.h b/hpvm/include/SupportVISC/VISCHint.h
index 5324c0fabddeef5f85a540176ffffb278ac1dfdf..99266b071843ab0417ea73c6e4533dfa381d52cd 100644
--- a/hpvm/include/SupportVISC/VISCHint.h
+++ b/hpvm/include/SupportVISC/VISCHint.h
@@ -15,21 +15,21 @@
 namespace visc {
 #endif
 
-  enum Target {
-    None,
-    CPU_TARGET,
-    GPU_TARGET,
-    SPIR_TARGET,
-    CUDNN_TARGET,
-    PROMISE_TARGET,
-    CPU_OR_GPU_TARGET,
-    CPU_OR_SPIR_TARGET,
-//    ALL_TARGETS,
-    NUM_TARGETS
-  };
+enum Target {
+  None,
+  CPU_TARGET,
+  GPU_TARGET,
+  SPIR_TARGET,
+  CUDNN_TARGET,
+  PROMISE_TARGET,
+  CPU_OR_GPU_TARGET,
+  CPU_OR_SPIR_TARGET,
+  //    ALL_TARGETS,
+  NUM_TARGETS
+};
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif //VISC_HINT_HEADER
+#endif // VISC_HINT_HEADER
diff --git a/hpvm/include/SupportVISC/VISCTimer.h b/hpvm/include/SupportVISC/VISCTimer.h
index 4dbadbd34f47e8fd35413317a5df28ba0589e3d5..ce3dc8a5e0f7c77ff06fec5857f223ca4f0e142f 100644
--- a/hpvm/include/SupportVISC/VISCTimer.h
+++ b/hpvm/include/SupportVISC/VISCTimer.h
@@ -27,57 +27,53 @@ enum visc_TimerState {
 
 struct visc_Timer {
   enum visc_TimerState state;
-  visc_Timestamp elapsed;       /* Amount of time elapsed so far */
-  visc_Timestamp init;		/* Beginning of the current time interval,
-				 * if state is RUNNING.  End of the last
-				 * recorded time interfal otherwise.  */
+  visc_Timestamp elapsed; /* Amount of time elapsed so far */
+  visc_Timestamp init;    /* Beginning of the current time interval,
+                           * if state is RUNNING.  End of the last
+                           * recorded time interval otherwise.  */
 };
 
 /* Reset a timer.
  * Use this to initialize a timer or to clear
  * its elapsed time.  The reset timer is stopped.
  */
-void
-visc_ResetTimer(struct visc_Timer *timer);
+void visc_ResetTimer(struct visc_Timer *timer);
 
 /* Start a timer.  The timer is set to RUNNING mode and
  * time elapsed while the timer is running is added to
  * the timer.
  * The timer should not already be running.
  */
-void
-visc_StartTimer(struct visc_Timer *timer);
+void visc_StartTimer(struct visc_Timer *timer);
 
 /* Stop a timer.
  * This stops adding elapsed time to the timer.
  * The timer should not already be stopped.
  */
-void
-visc_StopTimer(struct visc_Timer *timer);
+void visc_StopTimer(struct visc_Timer *timer);
 
 /* Get the elapsed time in seconds. */
-double
-visc_GetElapsedTime(struct visc_Timer *timer);
+double visc_GetElapsedTime(struct visc_Timer *timer);
 
 /* Execution time is assigned to one of these categories. */
 enum visc_TimerID {
   visc_TimerID_NONE = 0,
-  visc_TimerID_IO,		/* Time spent in input/output */
-  visc_TimerID_KERNEL,		/* Time spent computing on the device,
-				 * recorded asynchronously */
-  visc_TimerID_COPY,		/* Time spent synchronously moving data
-				 * to/from device and allocating/freeing
-				 * memory on the device */
-  visc_TimerID_DRIVER,		/* Time spent in the host interacting with the
-				 * driver, primarily for recording the time
-                                 * spent queueing asynchronous operations */
-  visc_TimerID_COPY_ASYNC,	/* Time spent in asynchronous transfers */
-  visc_TimerID_COMPUTE,		/* Time for all program execution other
-				 * than parsing command line arguments,
-				 * I/O, kernel, and copy */
-  visc_TimerID_OVERLAP,		/* Time double-counted in asynchronous and
-				 * host activity: automatically filled in,
-				 * not intended for direct usage */
+  visc_TimerID_IO,         /* Time spent in input/output */
+  visc_TimerID_KERNEL,     /* Time spent computing on the device,
+                            * recorded asynchronously */
+  visc_TimerID_COPY,       /* Time spent synchronously moving data
+                            * to/from device and allocating/freeing
+                            * memory on the device */
+  visc_TimerID_DRIVER,     /* Time spent in the host interacting with the
+                            * driver, primarily for recording the time
+                            * spent queueing asynchronous operations */
+  visc_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
+  visc_TimerID_COMPUTE,    /* Time for all program execution other
+                            * than parsing command line arguments,
+                            * I/O, kernel, and copy */
+  visc_TimerID_OVERLAP,    /* Time double-counted in asynchronous and
+                            * host activity: automatically filled in,
+                            * not intended for direct usage */
   // GPU FUNCTION
   visc_TimerID_INIT_CTX,
   visc_TimerID_CLEAR_CTX,
@@ -97,16 +93,16 @@ enum visc_TimerID {
   visc_TimerID_OUTPUT_PACK,
   visc_TimerID_OUTPUT_UNPACK,
 
-  visc_TimerID_LAST		/* Number of timer IDs */
+  visc_TimerID_LAST /* Number of timer IDs */
 };
 
 /* Dynamic list of asynchronously tracked times between events */
 struct visc_async_time_marker_list {
-  char *label; // actually just a pointer to a string
-  enum visc_TimerID timerID;	/* The ID to which the interval beginning
-                                 * with this marker should be attributed */
-  void * marker;
-  //cudaEvent_t marker; 		/* The driver event for this marker */
+  char *label;               // actually just a pointer to a string
+  enum visc_TimerID timerID; /* The ID to which the interval beginning
+                              * with this marker should be attributed */
+  void *marker;
+  // cudaEvent_t marker; 		/* The driver event for this marker */
   struct visc_async_time_marker_list *next;
 };
 
@@ -124,7 +120,7 @@ struct visc_SubTimerList {
 /* A set of timers for recording execution times. */
 struct visc_TimerSet {
   enum visc_TimerID current;
-  struct visc_async_time_marker_list* async_markers;
+  struct visc_async_time_marker_list *async_markers;
   visc_Timestamp async_begin;
   visc_Timestamp wall_begin;
   struct visc_Timer timers[visc_TimerID_LAST];
@@ -132,28 +128,24 @@ struct visc_TimerSet {
 };
 
 /* Reset all timers in the set. */
-void
-visc_InitializeTimerSet(struct visc_TimerSet *timers);
+void visc_InitializeTimerSet(struct visc_TimerSet *timers);
 
-void
-visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID visc_Category);
+void visc_AddSubTimer(struct visc_TimerSet *timers, char *label,
+                      enum visc_TimerID visc_Category);
 
 /* Select which timer the next interval of time should be accounted
  * to. The selected timer is started and other timers are stopped.
  * Using visc_TimerID_NONE stops all timers. */
-inline void
-visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer);
+inline void visc_SwitchToTimer(struct visc_TimerSet *timers,
+                               enum visc_TimerID timer);
 
-void
-visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID category);
+void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
+                           enum visc_TimerID category);
 
 /* Print timer values to standard output. */
-void
-visc_PrintTimerSet(struct visc_TimerSet *timers);
+void visc_PrintTimerSet(struct visc_TimerSet *timers);
 
 /* Release timer resources */
-void
-visc_DestroyTimerSet(struct visc_TimerSet * timers);
-
+void visc_DestroyTimerSet(struct visc_TimerSet *timers);
 }
-#endif //VISC_RT_HEADER
+#endif // VISC_RT_HEADER
diff --git a/hpvm/include/SupportVISC/VISCUtils.h b/hpvm/include/SupportVISC/VISCUtils.h
index 325acfaf1993964bc93d98eadd7ee06df0fd7140..0efd20b5b5eb57943de1feb6d2afa886c6c48a5c 100644
--- a/hpvm/include/SupportVISC/VISCUtils.h
+++ b/hpvm/include/SupportVISC/VISCUtils.h
@@ -12,18 +12,18 @@
 #define VISC_UTILS_HEADER
 
 #include <assert.h>
- 
-#include "llvm/IR/Module.h"
+
+#include "SupportVISC/VISCHint.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
-#include "llvm/IR/Metadata.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "SupportVISC/VISCHint.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 
@@ -32,125 +32,126 @@ using namespace llvm;
 namespace viscUtils {
 // Helper Functions
 
-static bool isViscCreateNodeIntrinsic(Instruction* I) {
-  if(!isa<IntrinsicInst>(I))
+static bool isViscCreateNodeIntrinsic(Instruction *I) {
+  if (!isa<IntrinsicInst>(I))
     return false;
-  IntrinsicInst* II = cast<IntrinsicInst>(I);
-  return (II->getCalledFunction()->getName()).startswith("llvm.visc.createNode");
+  IntrinsicInst *II = cast<IntrinsicInst>(I);
+  return (II->getCalledFunction()->getName())
+      .startswith("llvm.visc.createNode");
 }
 
-static bool isViscCreateNodeCall(Instruction* I) {
-  if(!isa<CallInst>(I))
+static bool isViscCreateNodeCall(Instruction *I) {
+  if (!isa<CallInst>(I))
     return false;
-  CallInst* CI = cast<CallInst>(I);
-  return (CI->getCalledValue()->stripPointerCasts()->getName()).startswith("__visc__createNode");
+  CallInst *CI = cast<CallInst>(I);
+  return (CI->getCalledValue()->stripPointerCasts()->getName())
+      .startswith("__visc__createNode");
 }
 
-static bool isViscLaunchCall(Instruction* I) {
-  if(!isa<CallInst>(I))
+static bool isViscLaunchCall(Instruction *I) {
+  if (!isa<CallInst>(I))
     return false;
-  CallInst* CI = cast<CallInst>(I);
-  return (CI->getCalledValue()->stripPointerCasts()->getName()).startswith("__visc__launch");
+  CallInst *CI = cast<CallInst>(I);
+  return (CI->getCalledValue()->stripPointerCasts()->getName())
+      .startswith("__visc__launch");
 }
 // Creates a new createNode intrinsic, similar to II but with different
 // associated function F instead
-IntrinsicInst* createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function* F,
-                                                                       IntrinsicInst* II) {
-  Module* M = F->getParent();
+IntrinsicInst *
+createIdenticalCreateNodeIntrinsicWithDifferentFunction(Function *F,
+                                                        IntrinsicInst *II) {
+  Module *M = F->getParent();
 
   // Find which createNode intrinsic we need to create
-  Function* CreateNodeF = Intrinsic::getDeclaration(M, II->getIntrinsicID());
-  Constant* Fp = ConstantExpr::getPointerCast(F,
-                                          Type::getInt8PtrTy(II->getContext()));
+  Function *CreateNodeF = Intrinsic::getDeclaration(M, II->getIntrinsicID());
+  Constant *Fp =
+      ConstantExpr::getPointerCast(F, Type::getInt8PtrTy(II->getContext()));
 
-  ArrayRef<Value*> CreateNodeArgs;
+  ArrayRef<Value *> CreateNodeArgs;
   switch (II->getIntrinsicID()) {
-    case Intrinsic::visc_createNode:
-    {
-      CreateNodeArgs = ArrayRef<Value*>(Fp);
-      break;
-    }
-    case Intrinsic::visc_createNode1D:
-    {
-      Value* CreateNode1DArgs[] = {Fp, II->getArgOperand(1)};
-      CreateNodeArgs = ArrayRef<Value*>(CreateNode1DArgs, 2);
-      break;
-    }
-    case Intrinsic::visc_createNode2D:
-    {
-      Value* CreateNode2DArgs[] = {Fp, II->getArgOperand(1),
-                                       II->getArgOperand(2)};
-      CreateNodeArgs = ArrayRef<Value*>(CreateNode2DArgs, 3);
-      break;
-    }
-    case Intrinsic::visc_createNode3D:
-    {
-      Value* CreateNode3DArgs[] = {Fp, II->getArgOperand(1),
-                                       II->getArgOperand(2),
-                                       II->getArgOperand(3)};
-      CreateNodeArgs = ArrayRef<Value*>(CreateNode3DArgs, 4);
-      break;
-    }
-    default :
-      assert(false && "Unknown createNode intrinsic");
-      break;
+  case Intrinsic::visc_createNode: {
+    CreateNodeArgs = ArrayRef<Value *>(Fp);
+    break;
+  }
+  case Intrinsic::visc_createNode1D: {
+    Value *CreateNode1DArgs[] = {Fp, II->getArgOperand(1)};
+    CreateNodeArgs = ArrayRef<Value *>(CreateNode1DArgs, 2);
+    break;
+  }
+  case Intrinsic::visc_createNode2D: {
+    Value *CreateNode2DArgs[] = {Fp, II->getArgOperand(1),
+                                 II->getArgOperand(2)};
+    CreateNodeArgs = ArrayRef<Value *>(CreateNode2DArgs, 3);
+    break;
+  }
+  case Intrinsic::visc_createNode3D: {
+    Value *CreateNode3DArgs[] = {Fp, II->getArgOperand(1), II->getArgOperand(2),
+                                 II->getArgOperand(3)};
+    CreateNodeArgs = ArrayRef<Value *>(CreateNode3DArgs, 4);
+    break;
+  }
+  default:
+    assert(false && "Unknown createNode intrinsic");
+    break;
   }
 
-  CallInst* CI = CallInst::Create(CreateNodeF,
-                                  CreateNodeArgs,
-                                  F->getName()+".node");
-  IntrinsicInst* CreateNodeII = cast<IntrinsicInst>(CI);
+  CallInst *CI =
+      CallInst::Create(CreateNodeF, CreateNodeArgs, F->getName() + ".node");
+  IntrinsicInst *CreateNodeII = cast<IntrinsicInst>(CI);
   return CreateNodeII;
 }
 
 // Fix VISC hints for this function
-void fixHintMetadata(Module &M, Function* F, Function* G) {
-    Metadata* MD_F = ValueAsMetadata::getIfExists(F);
-    MDTuple* MDT_F = MDTuple::getIfExists(F->getContext(), ArrayRef<Metadata*>(MD_F));
-    DEBUG(errs() << "Associated Metadata: " << *MDT_F << "\n");
-    MDTuple* MDT_G = MDNode::get(F->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(G)));
-    DEBUG(errs() << "New Metadata: " << *MDT_G << "\n");
-
-    auto FixHint = [&](StringRef Name) {
- 	NamedMDNode* HintNode = M.getOrInsertNamedMetadata(Name);
-    	for(unsigned i = 0; i < HintNode->getNumOperands(); i++) {
-          if(HintNode->getOperand(i) == MDT_F)
-             HintNode->setOperand(i, MDT_G);
-    	}	
-    };
-
-    FixHint("visc_hint_gpu");
-    FixHint("visc_hint_cpu");
-    FixHint("visc_hint_cpu_gpu");
+void fixHintMetadata(Module &M, Function *F, Function *G) {
+  Metadata *MD_F = ValueAsMetadata::getIfExists(F);
+  MDTuple *MDT_F =
+      MDTuple::getIfExists(F->getContext(), ArrayRef<Metadata *>(MD_F));
+  DEBUG(errs() << "Associated Metadata: " << *MDT_F << "\n");
+  MDTuple *MDT_G = MDNode::get(F->getContext(),
+                               ArrayRef<Metadata *>(ValueAsMetadata::get(G)));
+  DEBUG(errs() << "New Metadata: " << *MDT_G << "\n");
+
+  auto FixHint = [&](StringRef Name) {
+    NamedMDNode *HintNode = M.getOrInsertNamedMetadata(Name);
+    for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+      if (HintNode->getOperand(i) == MDT_F)
+        HintNode->setOperand(i, MDT_G);
+    }
+  };
+
+  FixHint("visc_hint_gpu");
+  FixHint("visc_hint_cpu");
+  FixHint("visc_hint_cpu_gpu");
 }
 
 // Assuming that the changed function is a node function, it is only used as a
 // first operand of createNode*. It is enough to iterate through all createNode*
 // calls in the program.
-void replaceNodeFunctionInIR(Module &M, Function* F, Function* G) {
+void replaceNodeFunctionInIR(Module &M, Function *F, Function *G) {
 
   for (auto &Func : M) {
     DEBUG(errs() << "Function: " << Func.getName() << "\n");
 
-    std::vector<Instruction*> toBeErased;
+    std::vector<Instruction *> toBeErased;
 
-    for (inst_iterator i = inst_begin(&Func), e = inst_end(&Func); i != e ; ++i) {
-      Instruction* I = &*i; // Grab pointer to Instruction
+    for (inst_iterator i = inst_begin(&Func), e = inst_end(&Func); i != e;
+         ++i) {
+      Instruction *I = &*i; // Grab pointer to Instruction
 
       if (isViscCreateNodeIntrinsic(I)) {
-        IntrinsicInst* II = cast<IntrinsicInst>(I);
+        IntrinsicInst *II = cast<IntrinsicInst>(I);
         // The found createNode is not associated with the changed function
         if (II->getArgOperand(0) != F)
           continue; // skip it
 
         // Otherwise, create a new createNode similar to the other one,
         // but with the changed function as first operand.
-        IntrinsicInst* CreateNodeII =
-          createIdenticalCreateNodeIntrinsicWithDifferentFunction(G, II);
+        IntrinsicInst *CreateNodeII =
+            createIdenticalCreateNodeIntrinsicWithDifferentFunction(G, II);
         II->replaceAllUsesWith(CreateNodeII);
         toBeErased.push_back(II);
       } else if (isViscCreateNodeCall(I)) {
-        CallInst* CI = cast<CallInst>(I);
+        CallInst *CI = cast<CallInst>(I);
         // The found createNode is not associated with the changed function
         if (CI->getArgOperand(1) != F)
           continue; // skip it
@@ -160,8 +161,8 @@ void replaceNodeFunctionInIR(Module &M, Function* F, Function* G) {
         // Replace use of F with use of G
         CI->setArgOperand(1, G);
         DEBUG(errs() << "Fixed use: " << *CI << "\n");
-      } else if(isViscLaunchCall(I)) {
-        CallInst* CI = cast<CallInst>(I);
+      } else if (isViscLaunchCall(I)) {
+        CallInst *CI = cast<CallInst>(I);
         // The found launch call is not associated with the changed function
         if (CI->getArgOperand(1)->stripPointerCasts() != F)
           continue;
@@ -171,31 +172,29 @@ void replaceNodeFunctionInIR(Module &M, Function* F, Function* G) {
         DEBUG(errs() << *CI->getArgOperand(1)->getType() << "\n");
         CI->setArgOperand(1, G);
       }
-
     }
 
-    for(auto I: toBeErased) {
+    for (auto I : toBeErased) {
       DEBUG(errs() << "\tErasing " << *I << "\n");
       I->eraseFromParent();
     }
   }
 
   // Check if the function is used by a metadata node
-  if(F->isUsedByMetadata()) {
+  if (F->isUsedByMetadata()) {
     fixHintMetadata(M, F, G);
   }
-  DEBUG(errs() << "DONE: Replacing function " << F->getName() << " with " << G->getName() << "\n");
+  DEBUG(errs() << "DONE: Replacing function " << F->getName() << " with "
+               << G->getName() << "\n");
 
   // Remove replaced function from the module
-  //assert(F->user_empty() && "Still some uses of older function left\n");
+  // assert(F->user_empty() && "Still some uses of older function left\n");
   F->replaceAllUsesWith(UndefValue::get(F->getType()));
   F->eraseFromParent();
-
 }
 
-
-// Create new function F' as a copy of old function F with a new signature and input VMAP.
-// The following two most used cases are handled by this function.
+// Create new function F' as a copy of old function F with a new signature and
+// input VMAP. The following two most used cases are handled by this function.
 // 1. When some extra arguments need to be added to this function
 //    - Here we can map the old function arguments to
 //      new ones
@@ -204,77 +203,92 @@ void replaceNodeFunctionInIR(Module &M, Function* F, Function* G) {
 //      over extra pointer arguments.
 // The function returns the list of return instructions to the caller to fix in
 // case the return type is also changed.
-Function* cloneFunction(Function* F, FunctionType* newFT, 
-  bool isAddingPtrSizeArg, SmallVectorImpl<ReturnInst*>* Returns = NULL, std::vector<Argument*> *Args = NULL) {
+Function *cloneFunction(Function *F, FunctionType *newFT,
+                        bool isAddingPtrSizeArg,
+                        SmallVectorImpl<ReturnInst *> *Returns = NULL,
+                        std::vector<Argument *> *Args = NULL) {
 
   DEBUG(errs() << "Cloning Function: " << F->getName() << "\n");
   DEBUG(errs() << "Old Function Type: " << *F->getFunctionType() << "\n");
   DEBUG(errs() << "New Function Type: " << *newFT << "\n");
 
-  assert(F->getFunctionType()->getNumParams() <= newFT->getNumParams()
-      && "This function assumes that the new function has more arguments than the old function!");
+  assert(F->getFunctionType()->getNumParams() <= newFT->getNumParams() &&
+         "This function assumes that the new function has more arguments than "
+         "the old function!");
 
   // Create Function of specified type
-  Function* newF = Function::Create(newFT, F->getLinkage(), F->getName()+"_cloned", F->getParent());
+  Function *newF = Function::Create(newFT, F->getLinkage(),
+                                    F->getName() + "_cloned", F->getParent());
   DEBUG(errs() << "Old Function name: " << F->getName() << "\n");
   DEBUG(errs() << "New Function name: " << newF->getName() << "\n");
   ValueToValueMapTy VMap;
   DEBUG(errs() << "No value map provided. Creating default value map\n");
-  if(isAddingPtrSizeArg) {
-    DEBUG(errs() << "Case 1: Pointer arg followed by a i64 size argument in new function\n");
+  if (isAddingPtrSizeArg) {
+    DEBUG(errs() << "Case 1: Pointer arg followed by a i64 size argument in "
+                    "new function\n");
     Function::arg_iterator new_ai = newF->arg_begin();
     if (Args == NULL) {
-      for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-          ai != ae; ++ai) {
-        DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n");
-        assert(ai->getType() == new_ai->getType() && "Arguments type do not match!");
+      for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+           ai != ae; ++ai) {
+        DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai
+                     << "\n");
+        assert(ai->getType() == new_ai->getType() &&
+               "Arguments type do not match!");
         VMap[&*ai] = &*new_ai;
         new_ai->takeName(&*ai);
-        if(ai->getType()->isPointerTy()) {
+        if (ai->getType()->isPointerTy()) {
           std::string oldName = new_ai->getName();
           // If the current argument is pointer type, the next argument in new
           // function would be an i64 type containing the data size of this
           // argument. Hence, skip the next arguement in new function.
           ++new_ai;
-          new_ai->setName("bytes_"+oldName);
+          new_ai->setName("bytes_" + oldName);
         }
         ++new_ai;
       }
     } else {
-      DEBUG(errs() << "Arguments of original function will be read from a vector!\n");
+      DEBUG(errs()
+            << "Arguments of original function will be read from a vector!\n");
       for (auto *ai : *(Args)) {
-        DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n");
-        assert(ai->getType() == new_ai->getType() && "Arguments type do not match!");
+        DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai
+                     << "\n");
+        assert(ai->getType() == new_ai->getType() &&
+               "Arguments type do not match!");
         VMap[ai] = &*new_ai;
         new_ai->takeName(ai);
-        if(ai->getType()->isPointerTy()) {
+        if (ai->getType()->isPointerTy()) {
           std::string oldName = new_ai->getName();
           // If the current argument is pointer type, the next argument in new
           // function would be an i64 type containing the data size of this
           // argument. Hence, skip the next arguement in new function.
           ++new_ai;
-          new_ai->setName("bytes_"+oldName);
+          new_ai->setName("bytes_" + oldName);
         }
         ++new_ai;
-      } 
+      }
     }
-  }
-  else {
-    DEBUG(errs() << "Case 2: Extra arguments are added at the end of old function\n");
+  } else {
+    DEBUG(errs()
+          << "Case 2: Extra arguments are added at the end of old function\n");
     Function::arg_iterator new_ai = newF->arg_begin();
     if (Args == NULL) {
-      for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-          ai != ae; ++ai, ++new_ai) {
-        DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n");
-        assert(ai->getType() == new_ai->getType() && "Arguments type do not match!");
+      for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+           ai != ae; ++ai, ++new_ai) {
+        DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai
+                     << "\n");
+        assert(ai->getType() == new_ai->getType() &&
+               "Arguments type do not match!");
         VMap[&*ai] = &*new_ai;
         new_ai->takeName(&*ai);
       }
     } else {
-      DEBUG(errs() << "Arguments of original function will be read from a vector!\n");
+      DEBUG(errs()
+            << "Arguments of original function will be read from a vector!\n");
       for (auto *ai : *(Args)) {
-        DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n");
-        assert(ai->getType() == new_ai->getType() && "Arguments type do not match!");
+        DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai
+                     << "\n");
+        assert(ai->getType() == new_ai->getType() &&
+               "Arguments type do not match!");
         VMap[ai] = &*new_ai;
         new_ai->takeName(ai);
         ++new_ai;
@@ -284,58 +298,62 @@ Function* cloneFunction(Function* F, FunctionType* newFT,
 
   // Clone function
   if (Returns == NULL)
-    Returns = new SmallVector<ReturnInst*, 8>();
+    Returns = new SmallVector<ReturnInst *, 8>();
   CloneFunctionInto(newF, F, VMap, false, *Returns);
 
   return newF;
 }
 
 // Overloaded version of cloneFunction
-Function *cloneFunction(Function *F, Function *newF,
-    bool isAddingPtrSizeArg,
-    SmallVectorImpl<ReturnInst *> *Returns = NULL) {
+Function *cloneFunction(Function *F, Function *newF, bool isAddingPtrSizeArg,
+                        SmallVectorImpl<ReturnInst *> *Returns = NULL) {
 
   DEBUG(errs() << "Cloning Function: " << F->getName() << "\n");
   DEBUG(errs() << "Old Function Type: " << *F->getFunctionType() << "\n");
   DEBUG(errs() << "New Function Type: " << *newF->getFunctionType() << "\n");
 
   assert(F->getFunctionType()->getNumParams() <=
-      newF->getFunctionType()->getNumParams() &&
-      "This function assumes that the new function has more arguments than "
-      "the old function!");
+             newF->getFunctionType()->getNumParams() &&
+         "This function assumes that the new function has more arguments than "
+         "the old function!");
 
   // Create Function of specified type
   DEBUG(errs() << "Old Function name: " << F->getName() << "\n");
   DEBUG(errs() << "New Function name: " << newF->getName() << "\n");
   ValueToValueMapTy VMap;
   DEBUG(errs() << "No value map provided. Creating default value map\n");
-  if(isAddingPtrSizeArg) {
-    DEBUG(errs() << "Case 1: Pointer arg followed by a i64 size argument in new function\n");
+  if (isAddingPtrSizeArg) {
+    DEBUG(errs() << "Case 1: Pointer arg followed by a i64 size argument in "
+                    "new function\n");
     Function::arg_iterator new_ai = newF->arg_begin();
-    for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-        ai != ae; ++ai) {
-      DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n");
-      assert(ai->getType() == new_ai->getType() && "Arguments type do not match!");
+    for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+         ai != ae; ++ai) {
+      DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai
+                   << "\n");
+      assert(ai->getType() == new_ai->getType() &&
+             "Arguments type do not match!");
       VMap[&*ai] = &*new_ai;
       new_ai->takeName(&*ai);
-      if(ai->getType()->isPointerTy()) {
+      if (ai->getType()->isPointerTy()) {
         std::string oldName = new_ai->getName();
         // If the current argument is pointer type, the next argument in new
         // function would be an i64 type containing the data size of this
         // argument. Hence, skip the next arguement in new function.
         ++new_ai;
-        new_ai->setName("bytes_"+oldName);
+        new_ai->setName("bytes_" + oldName);
       }
       ++new_ai;
     }
-  }
-  else {
-    DEBUG(errs() << "Case 2: Extra arguments are added at the end of old function\n");
+  } else {
+    DEBUG(errs()
+          << "Case 2: Extra arguments are added at the end of old function\n");
     Function::arg_iterator new_ai = newF->arg_begin();
-    for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-        ai != ae; ++ai, ++new_ai) {
-      DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai << "\n");
-      assert(ai->getType() == new_ai->getType() && "Arguments type do not match!");
+    for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+         ai != ae; ++ai, ++new_ai) {
+      DEBUG(errs() << ai->getArgNo() << ". " << *ai << " : " << *new_ai
+                   << "\n");
+      assert(ai->getType() == new_ai->getType() &&
+             "Arguments type do not match!");
       VMap[&*ai] = &*new_ai;
       new_ai->takeName(&*ai);
     }
@@ -343,134 +361,133 @@ Function *cloneFunction(Function *F, Function *newF,
 
   // Clone function
   if (Returns == NULL)
-    Returns = new SmallVector<ReturnInst*, 8>();
+    Returns = new SmallVector<ReturnInst *, 8>();
   CloneFunctionInto(newF, F, VMap, false, *Returns);
 
   return newF;
 }
 
-
 //------------------- Helper Functions For Handling Hints -------------------//
 
 // Return true if 1st arg (tag) contains 2nd (target)
 bool tagIncludesTarget(visc::Target Tag, visc::Target T) {
   switch (Tag) {
-    case visc::None:
-      return false;
-    case visc::CPU_TARGET:
-      if (T == visc::CPU_TARGET)
-        return true;
-      return false;
-    case visc::GPU_TARGET:
-      if (T == visc::GPU_TARGET)
-        return true;
-      return false;
-    case visc::CPU_OR_GPU_TARGET:
-      if ((T == visc::CPU_TARGET) ||
-          (T == visc::GPU_TARGET) ||
-          (T == visc::CPU_OR_GPU_TARGET))
-        return true;
-      return false;
-    default:
-      assert(false && "Unknown Target\n");
+  case visc::None:
+    return false;
+  case visc::CPU_TARGET:
+    if (T == visc::CPU_TARGET)
+      return true;
+    return false;
+  case visc::GPU_TARGET:
+    if (T == visc::GPU_TARGET)
+      return true;
+    return false;
+  case visc::CPU_OR_GPU_TARGET:
+    if ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET) ||
+        (T == visc::CPU_OR_GPU_TARGET))
+      return true;
+    return false;
+  default:
+    assert(false && "Unknown Target\n");
   }
 }
 
 bool isSingleTargetTag(visc::Target T) {
-  return ((T == visc::CPU_TARGET)    ||
-      (T == visc::GPU_TARGET));
+  return ((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET));
 }
 
 // Add the specified target to the given tag
 visc::Target getUpdatedTag(visc::Target Tag, visc::Target T) {
-  assert(((T == visc::CPU_TARGET)    ||
-        (T == visc::GPU_TARGET)) &&
-      "The target is only allowed to be a single target: CPU, GPU, SPIR, CUDNN, PROMISE\n");
+  assert(((T == visc::CPU_TARGET) || (T == visc::GPU_TARGET)) &&
+         "The target is only allowed to be a single target: CPU, GPU, SPIR, "
+         "CUDNN, PROMISE\n");
 
   switch (Tag) {
-    case visc::None:
-      return T;
-    case visc::CPU_TARGET:
-      if (T == visc::CPU_TARGET)
-        return visc::CPU_TARGET;
-      if (T == visc::GPU_TARGET)
-        return visc::CPU_OR_GPU_TARGET;
-      return T;
-    case visc::GPU_TARGET:
-      if (T == visc::CPU_TARGET)
-        return visc::CPU_OR_GPU_TARGET;
-      if (T == visc::GPU_TARGET)
-        return visc::GPU_TARGET;
-      return T;
-    case visc::CPU_OR_GPU_TARGET:
+  case visc::None:
+    return T;
+  case visc::CPU_TARGET:
+    if (T == visc::CPU_TARGET)
+      return visc::CPU_TARGET;
+    if (T == visc::GPU_TARGET)
       return visc::CPU_OR_GPU_TARGET;
-    default:
-      assert(false && "Unknown Target\n");
+    return T;
+  case visc::GPU_TARGET:
+    if (T == visc::CPU_TARGET)
+      return visc::CPU_OR_GPU_TARGET;
+    if (T == visc::GPU_TARGET)
+      return visc::GPU_TARGET;
+    return T;
+  case visc::CPU_OR_GPU_TARGET:
+    return visc::CPU_OR_GPU_TARGET;
+  default:
+    assert(false && "Unknown Target\n");
   }
   return T;
 }
 
 // This functions add the hint as metadata in visc code
-void addHint(Function* F, visc::Target T) {
+void addHint(Function *F, visc::Target T) {
   // Get Module
-  Module* M = F->getParent();
+  Module *M = F->getParent();
   DEBUG(errs() << "Set preferred target for " << F->getName() << ": ");
 
   // Based on the hint, get the hint metadata
-  NamedMDNode* HintNode;
+  NamedMDNode *HintNode;
   switch (T) {
-    case visc::GPU_TARGET:
-      DEBUG(errs() << "GPU Target\n");
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
-      break;
-    case visc::CPU_TARGET:
-      DEBUG(errs() << "CPU Target\n");
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
-      break;
-    case visc::CPU_OR_GPU_TARGET:
-      DEBUG(errs() << "CPU or GPU Target\n");
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu");
-      break;
-    default:
-      llvm_unreachable("Unsupported Target Hint!");
-      break;
+  case visc::GPU_TARGET:
+    DEBUG(errs() << "GPU Target\n");
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+    break;
+  case visc::CPU_TARGET:
+    DEBUG(errs() << "CPU Target\n");
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+    break;
+  case visc::CPU_OR_GPU_TARGET:
+    DEBUG(errs() << "CPU or GPU Target\n");
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu");
+    break;
+  default:
+    llvm_unreachable("Unsupported Target Hint!");
+    break;
   }
 
   // Create a node for the function and add it to the hint node
-  MDTuple* N = MDNode::get(M->getContext(), ArrayRef<Metadata*>(ValueAsMetadata::get(F)));
+  MDTuple *N = MDNode::get(M->getContext(),
+                           ArrayRef<Metadata *>(ValueAsMetadata::get(F)));
   HintNode->addOperand(N);
 }
 
 // This function removes the hint as metadata in visc code
-void removeHint(Function* F, visc::Target T) {
+void removeHint(Function *F, visc::Target T) {
   // Get Module
-  Module* M = F->getParent();
-  DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T << "\n");
+  Module *M = F->getParent();
+  DEBUG(errs() << "Remove preferred target for " << F->getName() << ": " << T
+               << "\n");
 
   // Based on the hint, get the hint metadata
-  NamedMDNode* HintNode;
+  NamedMDNode *HintNode;
   switch (T) {
-    case visc::GPU_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
-      break;
-    case visc::CPU_OR_GPU_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu");
-      break;
-    case visc::CPU_TARGET:
-      HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
-      break;
-    default:
-      llvm_unreachable("Unsupported Target Hint!");
-      break;
+  case visc::GPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_gpu");
+    break;
+  case visc::CPU_OR_GPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu_gpu");
+    break;
+  case visc::CPU_TARGET:
+    HintNode = M->getOrInsertNamedMetadata("visc_hint_cpu");
+    break;
+  default:
+    llvm_unreachable("Unsupported Target Hint!");
+    break;
   }
 
   // Gather metadata nodes, and keep those not associated with this function
-  MDNode* N = MDNode::get(M->getContext(),
-      ArrayRef<Metadata*>(ValueAsMetadata::get(F)));
-  std::vector<MDNode*> MDNodes;
+  MDNode *N = MDNode::get(M->getContext(),
+                          ArrayRef<Metadata *>(ValueAsMetadata::get(F)));
+  std::vector<MDNode *> MDNodes;
 
   for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
-    MDNode* MDN = HintNode->getOperand(i);
+    MDNode *MDN = HintNode->getOperand(i);
     if (MDN == N) {
       continue;
     }
@@ -482,32 +499,34 @@ void removeHint(Function* F, visc::Target T) {
   for (unsigned i = 0; i < MDNodes.size(); i++) {
     HintNode->addOperand(MDNodes[i]);
   }
-
 }
 
-visc::Target getPreferredTarget(Function* F) {
+visc::Target getPreferredTarget(Function *F) {
   DEBUG(errs() << "Finding preferred target for " << F->getName() << "\n");
-  Module* M = F->getParent();
+  Module *M = F->getParent();
 
   auto FoundPrefTarget = [=](StringRef Name) {
-    NamedMDNode* HintNode = M->getOrInsertNamedMetadata(Name);
-    for(unsigned i = 0; i < HintNode->getNumOperands(); i++) {
-      MDNode* N = HintNode->getOperand(i);
-      Value* FHint = dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue();
-      if(F == FHint)
+    NamedMDNode *HintNode = M->getOrInsertNamedMetadata(Name);
+    for (unsigned i = 0; i < HintNode->getNumOperands(); i++) {
+      MDNode *N = HintNode->getOperand(i);
+      Value *FHint =
+          dyn_cast<ValueAsMetadata>(N->getOperand(0).get())->getValue();
+      if (F == FHint)
         return true;
     }
-    return false;	
+    return false;
   };
 
-  if(FoundPrefTarget("visc_hint_cpu")) return visc::CPU_TARGET;
-  if(FoundPrefTarget("visc_hint_gpu")) return visc::GPU_TARGET;
-  if(FoundPrefTarget("visc_hint_cpu_gpu")) return visc::CPU_OR_GPU_TARGET;
+  if (FoundPrefTarget("visc_hint_cpu"))
+    return visc::CPU_TARGET;
+  if (FoundPrefTarget("visc_hint_gpu"))
+    return visc::GPU_TARGET;
+  if (FoundPrefTarget("visc_hint_cpu_gpu"))
+    return visc::CPU_OR_GPU_TARGET;
 
   return visc::None;
 }
 
+} // namespace viscUtils
 
-} // End of namespace
-
-#endif //VISC_UTILS_HEADER
+#endif // VISC_UTILS_HEADER
diff --git a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
index dc5a044dd7efc852f6b547c120b271eeea6cc107..058419f1dc80a8650e7a3b834090a88099741431 100644
--- a/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
+++ b/hpvm/lib/Transforms/BuildDFG/BuildDFG.cpp
@@ -10,13 +10,13 @@
 #define DEBUG_TYPE "buildDFG"
 #include "BuildDFG/BuildDFG.h"
 
+#include "SupportVISC/VISCHint.h"
+#include "SupportVISC/VISCUtils.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Support/Debug.h"
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCUtils.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
@@ -26,15 +26,15 @@ bool BuildDFG::runOnModule(Module &M) {
   DEBUG(errs() << "\nBUILDDFG PASS\n");
   DEBUG(errs() << "-------- Searching for launch sites ----------\n");
 
-  IntrinsicInst* II;
+  IntrinsicInst *II;
 
   // Iterate over all functions in the module
   for (auto &Func : M) {
-    Function* F = &Func;
+    Function *F = &Func;
     DEBUG(errs() << "Function: " << F->getName() << "\n");
 
-    for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e ; ++i) {
-      Instruction* I = &*i; // Grab pointer to Instruction
+    for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+      Instruction *I = &*i; // Grab pointer to Instruction
       if (isViscLaunchIntrinsic(I)) {
         DEBUG(errs() << "------------ Found launch site --------------\n");
         II = cast<IntrinsicInst>(I);
@@ -42,24 +42,25 @@ bool BuildDFG::runOnModule(Module &M) {
         assert(II && "Launch intrinsic not recognized.");
 
         // Intrinsic Instruction has been initialized from this point on.
-        Function* F = cast<Function>(II->getOperand(0)->stripPointerCasts());
+        Function *F = cast<Function>(II->getOperand(0)->stripPointerCasts());
         Root = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F));
         Roots.push_back(Root);
         BuildGraph(Root, F);
 
-        for(DFGraph::children_iterator i = Root->getChildGraph()->begin(),
-            e = Root->getChildGraph()->end(); i!=e; i++) {
-          DFNode* N = *i;
+        for (DFGraph::children_iterator i = Root->getChildGraph()->begin(),
+                                        e = Root->getChildGraph()->end();
+             i != e; i++) {
+          DFNode *N = *i;
           DEBUG(errs() << "\t" << N->getFuncPointer()->getName() << "\n");
         }
         Root->getChildGraph()->sortChildren();
-        for(DFGraph::children_iterator i = Root->getChildGraph()->begin(),
-            e = Root->getChildGraph()->end(); i!=e; i++) {
-          DFNode* N = *i;
+        for (DFGraph::children_iterator i = Root->getChildGraph()->begin(),
+                                        e = Root->getChildGraph()->end();
+             i != e; i++) {
+          DFNode *N = *i;
           DEBUG(errs() << "\t" << N->getFuncPointer()->getName() << "\n");
         }
         viewDFGraph(Root->getChildGraph());
-
       }
     }
   }
@@ -75,85 +76,85 @@ DFInternalNode *BuildDFG::getRoot() const {
   return Root;
 }
 
-std::vector<DFInternalNode*> &BuildDFG::getRoots() {
+std::vector<DFInternalNode *> &BuildDFG::getRoots() {
   assert((Roots.size() != 0) && "Number of roots cannot be zero.");
-  
+
   // All roots should have the same level
- for(auto *Node : Roots) 
-   assert(Node->getLevel() == 0 && "Invalid root node.");
+  for (auto *Node : Roots)
+    assert(Node->getLevel() == 0 && "Invalid root node.");
 
   return Roots;
 }
 
-//TODO: Maybe make this const
+// TODO: Maybe make this const
 BuildDFG::HandleToDFNode &BuildDFG::getHandleToDFNodeMap() {
   return HandleToDFNodeMap;
 }
 
-//TODO: Maybe make this const
+// TODO: Maybe make this const
 BuildDFG::HandleToDFEdge &BuildDFG::getHandleToDFEdgeMap() {
   return HandleToDFEdgeMap;
 }
 
-void BuildDFG::addElementToHandleToDFNodeMap(Value* V, DFNode* N) {
+void BuildDFG::addElementToHandleToDFNodeMap(Value *V, DFNode *N) {
   assert((HandleToDFNodeMap.find(V) == HandleToDFNodeMap.end()) &&
          "Attempted to insert duplicate key in HandleToDFNodeMap");
-  HandleToDFNodeMap.insert(std::pair<Value*, DFNode*>(V,N));
+  HandleToDFNodeMap.insert(std::pair<Value *, DFNode *>(V, N));
 }
 
-//TODO: check if the removed element was not there
-void BuildDFG::removeElementFromHandleToDFNodeMap(Value* V) {
+// TODO: check if the removed element was not there
+void BuildDFG::removeElementFromHandleToDFNodeMap(Value *V) {
   HandleToDFNodeMap.erase(V);
 }
 
-void BuildDFG::addElementToHandleToDFEdgeMap(Value* V, DFEdge* E) {
+void BuildDFG::addElementToHandleToDFEdgeMap(Value *V, DFEdge *E) {
   assert((HandleToDFEdgeMap.find(V) == HandleToDFEdgeMap.end()) &&
          "Attempted to insert duplicate key in HandleToDFEdgeMap");
-  HandleToDFEdgeMap.insert(std::pair<Value*, DFEdge*>(V,E));
+  HandleToDFEdgeMap.insert(std::pair<Value *, DFEdge *>(V, E));
 }
 
-//TODO: check if the removed element was not there
-void BuildDFG::removeElementFromHandleToDFEdgeMap(Value* V) {
+// TODO: check if the removed element was not there
+void BuildDFG::removeElementFromHandleToDFEdgeMap(Value *V) {
   HandleToDFEdgeMap.erase(V);
 }
 
 // Returns true if instruction I is a visc launch intrinsic, false otherwise
-bool BuildDFG::isViscLaunchIntrinsic(Instruction* I) {
-  if(!isa<IntrinsicInst>(I))
+bool BuildDFG::isViscLaunchIntrinsic(Instruction *I) {
+  if (!isa<IntrinsicInst>(I))
     return false;
-  IntrinsicInst* II = cast<IntrinsicInst>(I);
+  IntrinsicInst *II = cast<IntrinsicInst>(I);
   return (II->getCalledFunction()->getName()).equals("llvm.visc.launch");
 }
 
 // Returns true if instruction I is a visc graph intrinsic, false otherwise
-bool BuildDFG::isViscGraphIntrinsic(Instruction* I) {
-  if(!isa<IntrinsicInst>(I))
+bool BuildDFG::isViscGraphIntrinsic(Instruction *I) {
+  if (!isa<IntrinsicInst>(I))
     return false;
-  IntrinsicInst* II = cast<IntrinsicInst>(I);
-  return (II->getCalledFunction()->getName()).startswith("llvm.visc.create")
-         || (II->getCalledFunction()->getName()).startswith("llvm.visc.bind");
+  IntrinsicInst *II = cast<IntrinsicInst>(I);
+  return (II->getCalledFunction()->getName()).startswith("llvm.visc.create") ||
+         (II->getCalledFunction()->getName()).startswith("llvm.visc.bind");
 }
 
 // Returns true if instruction I is a visc query intrinsic, false otherwise
-bool BuildDFG::isViscQueryIntrinsic(Instruction* I) {
-  if(!isa<IntrinsicInst>(I))
+bool BuildDFG::isViscQueryIntrinsic(Instruction *I) {
+  if (!isa<IntrinsicInst>(I))
     return false;
-  IntrinsicInst* II = cast<IntrinsicInst>(I);
+  IntrinsicInst *II = cast<IntrinsicInst>(I);
   return (II->getCalledFunction()->getName()).startswith("llvm.visc.get");
 }
 
 // Returns true if instruction I is a visc intrinsic, false otherwise
-bool BuildDFG::isViscIntrinsic(Instruction* I) {
-  if(!isa<IntrinsicInst>(I))
+bool BuildDFG::isViscIntrinsic(Instruction *I) {
+  if (!isa<IntrinsicInst>(I))
     return false;
-  IntrinsicInst* II = cast<IntrinsicInst>(I);
+  IntrinsicInst *II = cast<IntrinsicInst>(I);
   return (II->getCalledFunction()->getName()).startswith("llvm.visc");
 }
 
 // Two types are "congruent" if they are identical, or if they are both
 // pointer types with different pointee types and the same address space.
-bool BuildDFG::isTypeCongruent(Type* L, Type* R) {
-  if(L == R)
+bool BuildDFG::isTypeCongruent(Type *L, Type *R) {
+  if (L == R)
     return true;
   PointerType *PL = dyn_cast<PointerType>(L);
   PointerType *PR = dyn_cast<PointerType>(R);
@@ -163,15 +164,15 @@ bool BuildDFG::isTypeCongruent(Type* L, Type* R) {
 }
 
 // Handles all the createNodeXX visc intrinsics.
-void BuildDFG::handleCreateNode(DFInternalNode* N, IntrinsicInst* II) {
+void BuildDFG::handleCreateNode(DFInternalNode *N, IntrinsicInst *II) {
   bool isInternalNode = false;
 
-  Function* F = cast<Function>((II->getOperand(0))->stripPointerCasts());
+  Function *F = cast<Function>((II->getOperand(0))->stripPointerCasts());
 
   // Check if the function associated with this intrinsic is a leaf or
   // internal node
   for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
-    Instruction* I = &*i; // Grab pointer to Instruction
+    Instruction *I = &*i; // Grab pointer to Instruction
     if (isViscGraphIntrinsic(I))
       isInternalNode = true;
   }
@@ -179,46 +180,49 @@ void BuildDFG::handleCreateNode(DFInternalNode* N, IntrinsicInst* II) {
   // Number of Dimensions would be equal to the (number of operands - 1) as
   // the first operand is the pointer to associated Function and the
   // remaining operands are the limits in each dimension.
-  unsigned numOfDim = II->getCalledFunction()->getFunctionType()->getNumParams()-1;
-  assert(numOfDim <= 3
-         && "Invalid number of dimensions for createNode intrinsic!");
-  std::vector<Value*> dimLimits;
+  unsigned numOfDim =
+      II->getCalledFunction()->getFunctionType()->getNumParams() - 1;
+  assert(numOfDim <= 3 &&
+         "Invalid number of dimensions for createNode intrinsic!");
+  std::vector<Value *> dimLimits;
   for (unsigned i = 1; i <= numOfDim; i++) {
     // The operands of II are same as the operands of the called
     // intrinsic. It has one extra operand at the end, which is the intrinsic
     // being called.
-    dimLimits.push_back(cast<Value> (II->getOperand(i)));
+    dimLimits.push_back(cast<Value>(II->getOperand(i)));
   }
 
-  if(isInternalNode) {
+  if (isInternalNode) {
     // Create Internal DFNode, add it to the map and recursively build its
     // dataflow graph
-    DFInternalNode* childDFNode = DFInternalNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
+    DFInternalNode *childDFNode = DFInternalNode::Create(
+        II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
     N->addChildToDFGraph(childDFNode);
     HandleToDFNodeMap[II] = childDFNode;
     BuildGraph(childDFNode, F);
-  }
-  else {
+  } else {
     // Create Leaf DFnode and add it to the map.
-    DFLeafNode* childDFNode = DFLeafNode::Create(II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
+    DFLeafNode *childDFNode = DFLeafNode::Create(
+        II, F, viscUtils::getPreferredTarget(F), N, numOfDim, dimLimits);
     N->addChildToDFGraph(childDFNode);
     HandleToDFNodeMap[II] = childDFNode;
   }
 }
 
-void BuildDFG::handleCreateEdge (DFInternalNode* N, IntrinsicInst* II) {
+void BuildDFG::handleCreateEdge(DFInternalNode *N, IntrinsicInst *II) {
   // The DFNode structures must be in the map before the edge is processed
   HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0));
   assert(DFI != HandleToDFNodeMap.end());
   DFI = HandleToDFNodeMap.find(II->getOperand(1));
   assert(DFI != HandleToDFNodeMap.end());
 
-  DFNode* SrcDF = HandleToDFNodeMap[II->getOperand(0)];
-  DFNode* DestDF = HandleToDFNodeMap[II->getOperand(1)];
+  DFNode *SrcDF = HandleToDFNodeMap[II->getOperand(0)];
+  DFNode *DestDF = HandleToDFNodeMap[II->getOperand(1)];
 
   bool EdgeType = !cast<ConstantInt>(II->getOperand(2))->isZero();
 
-  unsigned SourcePosition = cast<ConstantInt>(II->getOperand(3))->getZExtValue();
+  unsigned SourcePosition =
+      cast<ConstantInt>(II->getOperand(3))->getZExtValue();
   unsigned DestPosition = cast<ConstantInt>(II->getOperand(4))->getZExtValue();
 
   bool isStreaming = !cast<ConstantInt>(II->getOperand(5))->isZero();
@@ -227,27 +231,22 @@ void BuildDFG::handleCreateEdge (DFInternalNode* N, IntrinsicInst* II) {
 
   // Get destination type
   FunctionType *FT = DestDF->getFuncPointer()->getFunctionType();
-  assert((FT->getNumParams() > DestPosition)
-         && "Invalid argument number for destination dataflow node!");
+  assert((FT->getNumParams() > DestPosition) &&
+         "Invalid argument number for destination dataflow node!");
   DestTy = FT->getParamType(DestPosition);
 
   // Get source type
-  StructType* OutTy = SrcDF->getOutputType();
-  assert((OutTy->getNumElements() > SourcePosition)
-         && "Invalid argument number for source dataflow node!");
+  StructType *OutTy = SrcDF->getOutputType();
+  assert((OutTy->getNumElements() > SourcePosition) &&
+         "Invalid argument number for source dataflow node!");
   SrcTy = OutTy->getElementType(SourcePosition);
 
   // check if the types are compatible
-  assert(isTypeCongruent(SrcTy, DestTy)
-         && "Source and destination type of edge do not match");
+  assert(isTypeCongruent(SrcTy, DestTy) &&
+         "Source and destination type of edge do not match");
 
-  DFEdge* newDFEdge = DFEdge::Create(SrcDF,
-                                     DestDF,
-                                     EdgeType,
-                                     SourcePosition,
-                                     DestPosition,
-                                     DestTy,
-                                     isStreaming);
+  DFEdge *newDFEdge = DFEdge::Create(SrcDF, DestDF, EdgeType, SourcePosition,
+                                     DestPosition, DestTy, isStreaming);
 
   HandleToDFEdgeMap[II] = newDFEdge;
 
@@ -255,43 +254,39 @@ void BuildDFG::handleCreateEdge (DFInternalNode* N, IntrinsicInst* II) {
   N->addEdgeToDFGraph(newDFEdge);
 }
 
-void BuildDFG::handleBindInput(DFInternalNode* N, IntrinsicInst* II) {
+void BuildDFG::handleBindInput(DFInternalNode *N, IntrinsicInst *II) {
   // The DFNode structures must be in the map before the edge is processed
   HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0));
   assert(DFI != HandleToDFNodeMap.end());
 
-  DFNode* SrcDF = N->getChildGraph()->getEntry();
-  DFNode* DestDF = HandleToDFNodeMap[II->getOperand(0)];
+  DFNode *SrcDF = N->getChildGraph()->getEntry();
+  DFNode *DestDF = HandleToDFNodeMap[II->getOperand(0)];
 
-  unsigned SourcePosition = cast<ConstantInt>(II->getOperand(1))->getZExtValue();
+  unsigned SourcePosition =
+      cast<ConstantInt>(II->getOperand(1))->getZExtValue();
   unsigned DestPosition = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
 
   bool isStreaming = !cast<ConstantInt>(II->getOperand(3))->isZero();
-  
+
   // Get destination type
   FunctionType *FT = DestDF->getFuncPointer()->getFunctionType();
-  assert((FT->getNumParams() > DestPosition)
-         && "Invalid argument number for destination dataflow node!");
-  Type* DestTy = FT->getParamType(DestPosition);
+  assert((FT->getNumParams() > DestPosition) &&
+         "Invalid argument number for destination dataflow node!");
+  Type *DestTy = FT->getParamType(DestPosition);
 
   // Get source type
   FT = SrcDF->getFuncPointer()->getFunctionType();
-  assert((FT->getNumParams() > SourcePosition)
-         && "Invalid argument number for parent dataflow node!");
-  Type* SrcTy = FT->getParamType(SourcePosition);
+  assert((FT->getNumParams() > SourcePosition) &&
+         "Invalid argument number for parent dataflow node!");
+  Type *SrcTy = FT->getParamType(SourcePosition);
 
   // check if the types are compatible
-  assert(isTypeCongruent(SrcTy, DestTy)
-         && "Source and destination type of edge do not match");
+  assert(isTypeCongruent(SrcTy, DestTy) &&
+         "Source and destination type of edge do not match");
 
   // Add Binding as an edge between Entry and child Node
-  DFEdge* newDFEdge = DFEdge::Create(SrcDF,
-                                     DestDF,
-                                     false,
-                                     SourcePosition,
-                                     DestPosition,
-                                     DestTy,
-                                     isStreaming);
+  DFEdge *newDFEdge = DFEdge::Create(SrcDF, DestDF, false, SourcePosition,
+                                     DestPosition, DestTy, isStreaming);
 
   HandleToDFEdgeMap[II] = newDFEdge;
 
@@ -299,43 +294,39 @@ void BuildDFG::handleBindInput(DFInternalNode* N, IntrinsicInst* II) {
   N->addEdgeToDFGraph(newDFEdge);
 }
 
-void BuildDFG::handleBindOutput(DFInternalNode* N, IntrinsicInst* II) {
+void BuildDFG::handleBindOutput(DFInternalNode *N, IntrinsicInst *II) {
   // The DFNode structures must be in the map before the edge is processed
   HandleToDFNode::iterator DFI = HandleToDFNodeMap.find(II->getOperand(0));
   assert(DFI != HandleToDFNodeMap.end());
 
-  DFNode* SrcDF = HandleToDFNodeMap[II->getOperand(0)];
-  DFNode* DestDF = N->getChildGraph()->getExit();
+  DFNode *SrcDF = HandleToDFNodeMap[II->getOperand(0)];
+  DFNode *DestDF = N->getChildGraph()->getExit();
 
-  unsigned SourcePosition = cast<ConstantInt>(II->getOperand(1))->getZExtValue();
+  unsigned SourcePosition =
+      cast<ConstantInt>(II->getOperand(1))->getZExtValue();
   unsigned DestPosition = cast<ConstantInt>(II->getOperand(2))->getZExtValue();
 
   bool isStreaming = !cast<ConstantInt>(II->getOperand(3))->isZero();
-  
+
   // Get destination type
-  StructType* OutTy = DestDF->getOutputType();
-  assert((OutTy->getNumElements() > DestPosition)
-         && "Invalid argument number for destination parent dataflow node!");
-  Type* DestTy = OutTy->getElementType(DestPosition);
+  StructType *OutTy = DestDF->getOutputType();
+  assert((OutTy->getNumElements() > DestPosition) &&
+         "Invalid argument number for destination parent dataflow node!");
+  Type *DestTy = OutTy->getElementType(DestPosition);
 
   // Get source type
   OutTy = SrcDF->getOutputType();
-  assert((OutTy->getNumElements() > SourcePosition)
-         && "Invalid argument number for source dataflow node!");
-  Type* SrcTy = OutTy->getElementType(SourcePosition);
+  assert((OutTy->getNumElements() > SourcePosition) &&
+         "Invalid argument number for source dataflow node!");
+  Type *SrcTy = OutTy->getElementType(SourcePosition);
 
   // check if the types are compatible
-  assert(isTypeCongruent(SrcTy, DestTy)
-         && "Source and destination type of edge do not match");
+  assert(isTypeCongruent(SrcTy, DestTy) &&
+         "Source and destination type of edge do not match");
 
   // Add Binding as an edge between child and exit node
-  DFEdge* newDFEdge = DFEdge::Create(SrcDF,
-                                     DestDF,
-                                     false,
-                                     SourcePosition,
-                                     DestPosition,
-                                     DestTy,
-                                     isStreaming);
+  DFEdge *newDFEdge = DFEdge::Create(SrcDF, DestDF, false, SourcePosition,
+                                     DestPosition, DestTy, isStreaming);
 
   HandleToDFEdgeMap[II] = newDFEdge;
 
@@ -343,7 +334,7 @@ void BuildDFG::handleBindOutput(DFInternalNode* N, IntrinsicInst* II) {
   N->addEdgeToDFGraph(newDFEdge);
 }
 
-void BuildDFG::BuildGraph (DFInternalNode* N, Function *F) {
+void BuildDFG::BuildGraph(DFInternalNode *N, Function *F) {
   DEBUG(errs() << "FUNCTION: " << F->getName() << "\n");
   // TODO: Place checks for valid visc functions. For example one of the
   // check can be that any function that contains visc dataflow graph
@@ -351,49 +342,55 @@ void BuildDFG::BuildGraph (DFInternalNode* N, Function *F) {
 
   // Iterate over all the instructions of a function and look for visc
   // intrinsics.
-  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e ; ++i) {
-    Instruction* I = &*i; // Grab pointer to Instruction
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &*i; // Grab pointer to Instruction
     DEBUG(errs() << *I << "\n");
-    if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) {
-      DEBUG(errs() << "IntrinsicID = " << II->getIntrinsicID() << ": " << II->getCalledFunction()->getName()<<"\n");
-      switch(II->getIntrinsicID()) {
-        case Intrinsic::visc_createNode:
-        case Intrinsic::visc_createNode1D:
-        case Intrinsic::visc_createNode2D:
-        case Intrinsic::visc_createNode3D:
-	    handleCreateNode (N, II);
-	    break;
-        case Intrinsic::visc_createEdge:
-	    handleCreateEdge(N, II);
-	    break;
-        case Intrinsic::visc_bind_input:
-	    handleBindInput(N, II);
-	    break;
-       case Intrinsic::visc_bind_output:
-	    handleBindOutput(N, II);
-	    break;
-
-       //TODO: Reconsider launch within a dataflow graph (recursion?)
-       case Intrinsic::visc_wait:
-       case Intrinsic::visc_launch:
-	    DEBUG(errs() << "Error: Launch/wait intrinsic used within a dataflow graph\n\t" << *II << "\n");
-	    break;
-
-       default:
-	    DEBUG(errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t" << *II << "\n");
-	    break;
-     }   
-     continue;
-   }   
-   if(!isa<ReturnInst>(I) && !isa<CastInst>(I)) {
-     DEBUG(errs() << "Non-intrinsic instruction: " << *I << "\n");
-     llvm_unreachable("Found non-intrinsic instruction inside an internal node. Only return instruction is allowed!");
-   }
-  } 
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      DEBUG(errs() << "IntrinsicID = " << II->getIntrinsicID() << ": "
+                   << II->getCalledFunction()->getName() << "\n");
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::visc_createNode:
+      case Intrinsic::visc_createNode1D:
+      case Intrinsic::visc_createNode2D:
+      case Intrinsic::visc_createNode3D:
+        handleCreateNode(N, II);
+        break;
+      case Intrinsic::visc_createEdge:
+        handleCreateEdge(N, II);
+        break;
+      case Intrinsic::visc_bind_input:
+        handleBindInput(N, II);
+        break;
+      case Intrinsic::visc_bind_output:
+        handleBindOutput(N, II);
+        break;
+
+      // TODO: Reconsider launch within a dataflow graph (recursion?)
+      case Intrinsic::visc_wait:
+      case Intrinsic::visc_launch:
+        DEBUG(errs()
+              << "Error: Launch/wait intrinsic used within a dataflow graph\n\t"
+              << *II << "\n");
+        break;
+
+      default:
+        DEBUG(
+            errs() << "Error: Invalid VISC Intrinsic inside Internal node!\n\t"
+                   << *II << "\n");
+        break;
+      }
+      continue;
+    }
+    if (!isa<ReturnInst>(I) && !isa<CastInst>(I)) {
+      DEBUG(errs() << "Non-intrinsic instruction: " << *I << "\n");
+      llvm_unreachable("Found non-intrinsic instruction inside an internal "
+                       "node. Only return instruction is allowed!");
+    }
+  }
 }
 
 char BuildDFG::ID = 0;
-static RegisterPass<BuildDFG> X("buildDFG", "Hierarchical Dataflow Graph Builder Pass", false, false);
+static RegisterPass<BuildDFG>
+    X("buildDFG", "Hierarchical Dataflow Graph Builder Pass", false, false);
 
 } // End of namespace builddfg
-
diff --git a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp
index e8e814f881b5066d86f60bd10ae5941eed9179d6..6dae9e6977d31a0b62a9fa903966ec10810a2f71 100644
--- a/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp
+++ b/hpvm/lib/Transforms/ClearDFG/ClearDFG.cpp
@@ -8,17 +8,17 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "ClearDFG"
+#include "BuildDFG/BuildDFG.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
 #include "llvm/Support/Debug.h"
-#include "BuildDFG/BuildDFG.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 using namespace llvm;
 using namespace builddfg;
 
-//STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted");
+// STATISTIC(IntrinsicCounter, "Counts number of visc intrinsics greeted");
 
 namespace {
 
@@ -35,18 +35,14 @@ private:
 public:
   bool runOnModule(Module &M);
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
-    AU.addRequired<BuildDFG>();
-  }
-
-
+  void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<BuildDFG>(); }
 };
 
 // Visitor for Code generation traversal (tree traversal for now)
 class TreeTraversal : public DFNodeVisitor {
 
 private:
-  //Member variables
+  // Member variables
   Module &M;
   BuildDFG &DFG;
 
@@ -54,37 +50,43 @@ private:
   // extra index and dimension arguments. This map also serves to find out if
   // we already have an index and dim extended function copy or not (i.e.,
   // "Have we visited this function before?")
-  ValueMap<Function*, Function*> FMap;
-  DenseMap<DFNode*, CallInst*> CallMap;
+  ValueMap<Function *, Function *> FMap;
+  DenseMap<DFNode *, CallInst *> CallMap;
+
+  // Functions
+  void deleteNode(DFNode *N);
 
-  //Functions
-  void deleteNode(DFNode* N);
 public:
   // Constructor
-  TreeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) { }
+  TreeTraversal(Module &_M, BuildDFG &_DFG) : M(_M), DFG(_DFG) {}
 
-  virtual void visit(DFInternalNode* N) {
+  virtual void visit(DFInternalNode *N) {
     // Follows a bottom-up approach for code generation.
     // First generate code for all the child nodes
-    for(DFGraph::children_iterator i = N->getChildGraph()->begin(),
-        e = N->getChildGraph()->end(); i != e; ++i) {
-      DFNode* child = *i;
+    for (DFGraph::children_iterator i = N->getChildGraph()->begin(),
+                                    e = N->getChildGraph()->end();
+         i != e; ++i) {
+      DFNode *child = *i;
       child->applyDFNodeVisitor(*this);
     }
-    DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Erasing Node (I) - " << N->getFuncPointer()->getName()
+                 << "\n");
     // Generate code for this internal node now. This way all the cloned
     // functions for children exist.
     deleteNode(N);
-    DEBUG(errs() << "\tDone - " << "\n");
-    //errs() << "DONE: Generating Code for Node (I) - " << N->getFuncPointer()->getName() << "\n";
+    DEBUG(errs() << "\tDone - "
+                 << "\n");
+    // errs() << "DONE: Generating Code for Node (I) - " <<
+    // N->getFuncPointer()->getName() << "\n";
   }
 
-  virtual void visit(DFLeafNode* N) {
-    DEBUG(errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName() << "\n");
+  virtual void visit(DFLeafNode *N) {
+    DEBUG(errs() << "Erasing Node (L) - " << N->getFuncPointer()->getName()
+                 << "\n");
     deleteNode(N);
-    DEBUG(errs() << "DONE" << "\n");
+    DEBUG(errs() << "DONE"
+                 << "\n");
   }
-
 };
 
 bool ClearDFG::runOnModule(Module &M) {
@@ -95,26 +97,28 @@ bool ClearDFG::runOnModule(Module &M) {
   BuildDFG &DFG = getAnalysis<BuildDFG>();
 
   // DFInternalNode *Root = DFG.getRoot();
-  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+  std::vector<DFInternalNode *> Roots = DFG.getRoots();
   // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
   // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
-  Function* VI = M.getFunction("llvm.visc.init");
+  Function *VI = M.getFunction("llvm.visc.init");
   assert(VI->hasOneUse() && "More than one use of llvm.visc.init\n");
-  for(Value::user_iterator ui = VI->user_begin(), ue = VI->user_end(); ui != ue; ui++) {
-    Instruction* I = dyn_cast<Instruction>(*ui);
+  for (Value::user_iterator ui = VI->user_begin(), ue = VI->user_end();
+       ui != ue; ui++) {
+    Instruction *I = dyn_cast<Instruction>(*ui);
     I->eraseFromParent();
   }
   VI->replaceAllUsesWith(UndefValue::get(VI->getType()));
   VI->eraseFromParent();
 
-  Function* VC = M.getFunction("llvm.visc.cleanup");
+  Function *VC = M.getFunction("llvm.visc.cleanup");
   assert(VC->hasOneUse() && "More than one use of llvm.visc.cleanup\n");
-  for(Value::user_iterator ui = VC->user_begin(), ue = VC->user_end(); ui != ue; ui++) {
-    Instruction* I = dyn_cast<Instruction>(*ui);
+  for (Value::user_iterator ui = VC->user_begin(), ue = VC->user_end();
+       ui != ue; ui++) {
+    Instruction *I = dyn_cast<Instruction>(*ui);
     I->eraseFromParent();
   }
-    
+
   VC->replaceAllUsesWith(UndefValue::get(VC->getType()));
   VC->eraseFromParent();
 
@@ -122,25 +126,25 @@ bool ClearDFG::runOnModule(Module &M) {
   TreeTraversal *Visitor = new TreeTraversal(M, DFG);
 
   // Initiate code generation for root DFNode
-  for (auto rootNode: Roots) {
+  for (auto rootNode : Roots) {
     Visitor->visit(rootNode);
   }
   delete Visitor;
   return true;
 }
 
-void TreeTraversal::deleteNode(DFNode* N) {
-  if(N->isDummyNode())
+void TreeTraversal::deleteNode(DFNode *N) {
+  if (N->isDummyNode())
     return;
   // Erase Function associated with this node
-  Function* F = N->getFuncPointer();
+  Function *F = N->getFuncPointer();
   F->replaceAllUsesWith(UndefValue::get(F->getType()));
   F->eraseFromParent();
   // If N is not a root node, we are done. Return.
-  if(!N->isRoot())
+  if (!N->isRoot())
     return;
   // N is a root node. Delete the Launch Intrinsic associated it with as well.
-  IntrinsicInst* LI = N->getInstruction();
+  IntrinsicInst *LI = N->getInstruction();
   LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
   LI->eraseFromParent();
 }
@@ -148,8 +152,7 @@ void TreeTraversal::deleteNode(DFNode* N) {
 } // End of namespace
 
 char ClearDFG::ID = 0;
-static RegisterPass<ClearDFG> X("clearDFG",
-                                    "Delete all DFG functions for which code has been generated",
-                                    false /* does not modify the CFG */,
-                                    true /* transformation, not just analysis */);
-
+static RegisterPass<ClearDFG>
+    X("clearDFG", "Delete all DFG functions for which code has been generated",
+      false /* does not modify the CFG */,
+      true /* transformation, not just analysis */);
diff --git a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
index 08f6314a812844f85e7fe6d5ce50cf6e8393a2e0..c9ce98cb7230cc694d50303eeff8f007a24aecdd 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_NVPTX/DFG2LLVM_NVPTX.cpp
@@ -15,29 +15,28 @@
 #define SHARED_ADDRSPACE 3
 
 #define DEBUG_TYPE "DFG2LLVM_NVPTX"
+#include "SupportVISC/DFG2LLVM.h"
+#include "SupportVISC/VISCTimer.h"
+#include "SupportVISC/VISCUtils.h"
+#include "llvm-c/Core.h"
+#include "llvm/IR/Attributes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker/Linker.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm-c/Core.h"
-#include "SupportVISC/VISCTimer.h"
-#include "SupportVISC/DFG2LLVM.h"
-#include "SupportVISC/VISCUtils.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 #include "llvm/IR/IRPrintingPasses.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Support/ToolOutputFile.h"
 #include "llvm/IR/UseListOrder.h"
-
+#include "llvm/Support/ToolOutputFile.h"
 
 #include <sstream>
 
@@ -47,8 +46,8 @@ using namespace dfg2llvm;
 using namespace viscUtils;
 
 // VISC Command line option to use timer or not
-static cl::opt<bool>
-VISCTimer_NVPTX("visc-timers-ptx", cl::desc("Enable visc timers"));
+static cl::opt<bool> VISCTimer_NVPTX("visc-timers-ptx",
+                                     cl::desc("Enable visc timers"));
 
 namespace {
 // Helper class declarations
@@ -57,94 +56,88 @@ namespace {
 // in bytes. Would have preferred to use tuple but support not yet available
 class OutputPtr {
 public:
-  OutputPtr(Value* _h_ptr, Value* _d_ptr, Value* _bytes)
-    : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
+  OutputPtr(Value *_h_ptr, Value *_d_ptr, Value *_bytes)
+      : h_ptr(_h_ptr), d_ptr(_d_ptr), bytes(_bytes) {}
 
-  Value* h_ptr;
-  Value* d_ptr;
-  Value* bytes;
+  Value *h_ptr;
+  Value *d_ptr;
+  Value *bytes;
 };
 
 // Class to maintain important kernel info required for generating runtime
 // calls
 class Kernel {
 public:
-  Kernel(Function* _KF, DFLeafNode* _KLeafNode, std::map<unsigned, unsigned> _inArgMap =
-           std::map<unsigned, unsigned>(),
-         std::map<unsigned, std::pair<Value*, unsigned> > _sharedInArgMap =
-           std::map<unsigned, std::pair<Value*, unsigned> >(),
-         std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
-         unsigned _gridDim = 0, std::vector<Value*> _globalWGSize = std::vector<Value*>(),
-         unsigned _blockDim = 0, std::vector<Value*> _localWGSize = std::vector<Value*>())
-    : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
-      sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap), gridDim(_gridDim),
-      globalWGSize(_globalWGSize), blockDim(_blockDim), localWGSize(_localWGSize) {
-
-    assert(gridDim == globalWGSize.size()
-           && "gridDim should be same as the size of vector globalWGSize");
-    assert(blockDim == localWGSize.size()
-           && "blockDim should be same as the size of vector localWGSize");
+  Kernel(
+      Function *_KF, DFLeafNode *_KLeafNode,
+      std::map<unsigned, unsigned> _inArgMap = std::map<unsigned, unsigned>(),
+      std::map<unsigned, std::pair<Value *, unsigned>> _sharedInArgMap =
+          std::map<unsigned, std::pair<Value *, unsigned>>(),
+      std::vector<unsigned> _outArgMap = std::vector<unsigned>(),
+      unsigned _gridDim = 0,
+      std::vector<Value *> _globalWGSize = std::vector<Value *>(),
+      unsigned _blockDim = 0,
+      std::vector<Value *> _localWGSize = std::vector<Value *>())
+      : KernelFunction(_KF), KernelLeafNode(_KLeafNode), inArgMap(_inArgMap),
+        sharedInArgMap(_sharedInArgMap), outArgMap(_outArgMap),
+        gridDim(_gridDim), globalWGSize(_globalWGSize), blockDim(_blockDim),
+        localWGSize(_localWGSize) {
+
+    assert(gridDim == globalWGSize.size() &&
+           "gridDim should be same as the size of vector globalWGSize");
+    assert(blockDim == localWGSize.size() &&
+           "blockDim should be same as the size of vector localWGSize");
   }
 
-  Function* KernelFunction;
-  DFLeafNode* KernelLeafNode;
+  Function *KernelFunction;
+  DFLeafNode *KernelLeafNode;
   std::map<unsigned, unsigned> inArgMap;
   // Map for shared memory arguments
-  std::map<unsigned, std::pair<Value*, unsigned> > sharedInArgMap;
+  std::map<unsigned, std::pair<Value *, unsigned>> sharedInArgMap;
   // Fields for (potential) allocation node
-  DFLeafNode* AllocationNode;
-  Function* AllocationFunction;
+  DFLeafNode *AllocationNode;
+  Function *AllocationFunction;
   std::map<unsigned, unsigned> allocInArgMap;
 
   std::vector<unsigned> outArgMap;
   unsigned gridDim;
-  std::vector<Value*> globalWGSize;
+  std::vector<Value *> globalWGSize;
   unsigned blockDim;
-  std::vector<Value*> localWGSize;
+  std::vector<Value *> localWGSize;
   std::vector<int> localDimMap;
 
-  std::map<unsigned, unsigned> &getInArgMap() {
-    return inArgMap;
-  }
-  void setInArgMap(std::map<unsigned, unsigned> map) {
-    inArgMap = map;
-  }
+  std::map<unsigned, unsigned> &getInArgMap() { return inArgMap; }
+  void setInArgMap(std::map<unsigned, unsigned> map) { inArgMap = map; }
 
-  std::map<unsigned, std::pair<Value*, unsigned> > &getSharedInArgMap() {
+  std::map<unsigned, std::pair<Value *, unsigned>> &getSharedInArgMap() {
     return sharedInArgMap;
   }
-  void setSharedInArgMap(std::map<unsigned, std::pair<Value*, unsigned> > map) {
+  void setSharedInArgMap(std::map<unsigned, std::pair<Value *, unsigned>> map) {
     sharedInArgMap = map;
   }
 
-  std::vector<unsigned> &getOutArgMap() {
-    return outArgMap;
-  }
-  void setOutArgMap(std::vector<unsigned> map) {
-    outArgMap = map;
-  }
+  std::vector<unsigned> &getOutArgMap() { return outArgMap; }
+  void setOutArgMap(std::vector<unsigned> map) { outArgMap = map; }
 
-  void setLocalWGSize(std::vector<Value*> V) {
-    localWGSize = V;
-  }
+  void setLocalWGSize(std::vector<Value *> V) { localWGSize = V; }
 
-  bool hasLocalWG() const {
-    return blockDim != 0;
-  }
+  bool hasLocalWG() const { return blockDim != 0; }
 };
 
 // Helper function declarations
-static bool canBePromoted(Argument* arg, Function* F);
-static void getExecuteNodeParams(Module &M, Value* &, Value* &, Value* &, Kernel*,
-                                 ValueToValueMapTy&, Instruction*);
-static Value* genWorkGroupPtr(Module &M, std::vector<Value*>, ValueToValueMapTy&,
-                              Instruction*, const Twine& WGName = "WGSize");
-static std::string getPTXFilename(const Module&);
-static std::string getFilenameFromModule(const Module& M);
+static bool canBePromoted(Argument *arg, Function *F);
+static void getExecuteNodeParams(Module &M, Value *&, Value *&, Value *&,
+                                 Kernel *, ValueToValueMapTy &, Instruction *);
+static Value *genWorkGroupPtr(Module &M, std::vector<Value *>,
+                              ValueToValueMapTy &, Instruction *,
+                              const Twine &WGName = "WGSize");
+static std::string getPTXFilename(const Module &);
+static std::string getFilenameFromModule(const Module &M);
 static void changeDataLayout(Module &);
 static void changeTargetTriple(Module &);
 static void findReturnInst(Function *, std::vector<ReturnInst *> &);
-static void findIntrinsicInst(Function *, Intrinsic::ID, std::vector<IntrinsicInst *> &);
+static void findIntrinsicInst(Function *, Intrinsic::ID,
+                              std::vector<IntrinsicInst *> &);
 static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID);
 static std::string getAtomicOpName(Intrinsic::ID);
 
@@ -154,7 +147,6 @@ struct DFG2LLVM_NVPTX : public DFG2LLVM {
   DFG2LLVM_NVPTX() : DFG2LLVM(ID) {}
 
 private:
-
 public:
   bool runOnModule(Module &M);
 };
@@ -163,10 +155,10 @@ public:
 class CGT_NVPTX : public CodeGenTraversal {
 
 private:
-  //Member variables
+  // Member variables
   std::unique_ptr<Module> KernelM;
-  DFNode* KernelLaunchNode = NULL;
-  Kernel* kernel;
+  DFNode *KernelLaunchNode = NULL;
+  Kernel *kernel;
 
   // VISC Runtime API
   FunctionCallee llvm_visc_ocl_launch;
@@ -181,14 +173,16 @@ private:
   FunctionCallee llvm_visc_ocl_getOutput;
   FunctionCallee llvm_visc_ocl_executeNode;
 
-  //Functions
+  // Functions
   std::string getKernelsModuleName(Module &M);
-  void fixValueAddrspace(Value* V, unsigned addrspace);
-  std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned>*, Function*);
-  Function* changeArgAddrspace(Function* F, std::vector<unsigned> &Ags, unsigned i);
-  void addCLMetadata(Function* F);
-  Function* transformFunctionToVoid(Function* F);
-  void insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName);
+  void fixValueAddrspace(Value *V, unsigned addrspace);
+  std::vector<unsigned> globalToConstantMemoryOpt(std::vector<unsigned> *,
+                                                  Function *);
+  Function *changeArgAddrspace(Function *F, std::vector<unsigned> &Ags,
+                               unsigned i);
+  void addCLMetadata(Function *F);
+  Function *transformFunctionToVoid(Function *F);
+  void insertRuntimeCalls(DFInternalNode *N, Kernel *K, const Twine &FileName);
 
   // Virtual Functions
   void init() {
@@ -196,24 +190,25 @@ private:
     TargetName = "NVPTX";
   }
   void initRuntimeAPI();
-  void codeGen(DFInternalNode* N);
-  void codeGen(DFLeafNode* N);
+  void codeGen(DFInternalNode *N);
+  void codeGen(DFLeafNode *N);
 
 public:
-
   // Constructor
-  CGT_NVPTX(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) {
+  CGT_NVPTX(Module &_M, BuildDFG &_DFG)
+      : CodeGenTraversal(_M, _DFG), KernelM(CloneModule(_M)) {
     init();
     initRuntimeAPI();
     DEBUG(errs() << "Old module pointer: " << &_M << "\n");
-    DEBUG(errs() << "New module pointer: " <<  KernelM.get() << "\n");
+    DEBUG(errs() << "New module pointer: " << KernelM.get() << "\n");
 
-    // Copying instead of creating new, in order to preserve required info (metadata)
-    // Remove functions, global variables and aliases
-    std::vector<GlobalVariable*> GVVect;
+    // Copying instead of creating new, in order to preserve required info
+    // (metadata). Remove functions, global variables and aliases.
+    std::vector<GlobalVariable *> GVVect;
     for (Module::global_iterator mi = KernelM->global_begin(),
-         me = KernelM->global_end(); (mi != me); ++mi) {
-      GlobalVariable* GV = &*mi;
+                                 me = KernelM->global_end();
+         (mi != me); ++mi) {
+      GlobalVariable *GV = &*mi;
       GVVect.push_back(GV);
     }
     for (auto *GV : GVVect) {
@@ -221,10 +216,10 @@ public:
       GV->eraseFromParent();
     }
 
-    std::vector<Function*> FuncVect;
-    for (Module::iterator mi = KernelM->begin(),
-         me = KernelM->end(); (mi != me); ++mi) {
-      Function* F = &*mi;
+    std::vector<Function *> FuncVect;
+    for (Module::iterator mi = KernelM->begin(), me = KernelM->end();
+         (mi != me); ++mi) {
+      Function *F = &*mi;
       FuncVect.push_back(F);
     }
     for (auto *F : FuncVect) {
@@ -232,10 +227,11 @@ public:
       F->eraseFromParent();
     }
 
-    std::vector<GlobalAlias*> GAVect;
+    std::vector<GlobalAlias *> GAVect;
     for (Module::alias_iterator mi = KernelM->alias_begin(),
-         me = KernelM->alias_end(); (mi != me); ++mi) {
-      GlobalAlias* GA = &*mi;
+                                me = KernelM->alias_end();
+         (mi != me); ++mi) {
+      GlobalAlias *GA = &*mi;
       GAVect.push_back(GA);
     }
     for (auto *GA : GAVect) {
@@ -246,9 +242,7 @@ public:
     changeDataLayout(*KernelM);
     changeTargetTriple(*KernelM);
 
-
     DEBUG(errs() << *KernelM);
-
   }
 
   void writeKernelsModule();
@@ -260,14 +254,14 @@ void CGT_NVPTX::initRuntimeAPI() {
   // Load Runtime API Module
   SMDiagnostic Err;
 
-  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
   assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
 
   Twine llvmSrcRoot = LLVM_SRC_ROOT;
   Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/visc-rt/visc-rt.ll";
 
   runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
-  if(runtimeModule == nullptr)
+  if (runtimeModule == nullptr)
     DEBUG(errs() << Err.getMessage());
   else
     DEBUG(errs() << "Successfully loaded visc-rt API module\n");
@@ -290,27 +284,25 @@ void CGT_NVPTX::initRuntimeAPI() {
 
   // Insert init context in main
   DEBUG(errs() << "Gen Code to initialize NVPTX Timer\n");
-  Function* VI = M.getFunction("llvm.visc.init");
+  Function *VI = M.getFunction("llvm.visc.init");
   assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
 
   InitCall = cast<Instruction>(*VI->user_begin());
   initializeTimerSet(InitCall);
   switchToTimer(visc_TimerID_INIT_CTX, InitCall);
   CallInst::Create(llvm_visc_ocl_initContext,
-                   ArrayRef<Value*>(getTargetID(M, visc::GPU_TARGET)),
-                   "", InitCall);
+                   ArrayRef<Value *>(getTargetID(M, visc::GPU_TARGET)), "",
+                   InitCall);
   switchToTimer(visc_TimerID_NONE, InitCall);
 
   // Insert print instruction at visc exit
   DEBUG(errs() << "Gen Code to print NVPTX Timer\n");
-  Function* VC = M.getFunction("llvm.visc.cleanup");
+  Function *VC = M.getFunction("llvm.visc.cleanup");
   DEBUG(errs() << *VC << "\n");
   assert(VC->getNumUses() == 1 && "__visc__clear should only be used once");
 
   CleanupCall = cast<Instruction>(*VC->user_begin());
   printTimerSet(CleanupCall);
-
-
 }
 
 // Generate Code to call the kernel
@@ -318,36 +310,37 @@ void CGT_NVPTX::initRuntimeAPI() {
 // used to generate a function to associate with this leaf node. The function
 // is responsible for all the memory allocation/transfer and invoking the
 // kernel call on the device
-void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& FileName) {
+void CGT_NVPTX::insertRuntimeCalls(DFInternalNode *N, Kernel *K,
+                                   const Twine &FileName) {
   // Check if clone already exists. If it does, it means we have visited this
   // function before.
-//  assert(N->getGenFunc() == NULL && "Code already generated for this node");
+  //  assert(N->getGenFunc() == NULL && "Code already generated for this node");
 
   assert(N->getGenFuncForTarget(visc::GPU_TARGET) == NULL &&
          "Code already generated for this node");
 
   // Useful values
-  Value* True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
-  Value* False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
+  Value *True = ConstantInt::get(Type::getInt1Ty(M.getContext()), 1);
+  Value *False = ConstantInt::get(Type::getInt1Ty(M.getContext()), 0);
 
   // If kernel struct has not been initialized with kernel function, then fail
   assert(K != NULL && "No kernel found!!");
 
   DEBUG(errs() << "Generating kernel call code\n");
 
-  Function* F = N->getFuncPointer();
-
+  Function *F = N->getFuncPointer();
 
   // Create of clone of F with no instructions. Only the type is the same as F
   // without the extra arguments.
-  Function* F_X86;
+  Function *F_X86;
 
   // Clone the function, if we are seeing this function for the first time. We
   // only need a clone in terms of type.
   ValueToValueMapTy VMap;
 
   // Create new function with the same type
-  F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+  F_X86 =
+      Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
 
   // Loop over the arguments, copying the names of arguments over.
   Function::arg_iterator dest_iterator = F_X86->arg_begin();
@@ -360,27 +353,26 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
 
   // Add a basic block to this empty function
   BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F_X86);
-  ReturnInst* RI = ReturnInst::Create(M.getContext(),
-                                      UndefValue::get(F_X86->getReturnType()), BB);
+  ReturnInst *RI = ReturnInst::Create(
+      M.getContext(), UndefValue::get(F_X86->getReturnType()), BB);
 
   // FIXME: Adding Index and Dim arguments are probably not required except
   // for consistency purpose (DFG2LLVM_X86 does assume that all leaf nodes do
   // have those arguments)
 
   // Add Index and Dim arguments except for the root node
-  if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+  if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
     F_X86 = addIdxDimArgs(F_X86);
 
   BB = &*F_X86->begin();
   RI = cast<ReturnInst>(BB->getTerminator());
 
-  //Add the generated function info to DFNode
-//  N->setGenFunc(F_X86, visc::CPU_TARGET);
+  // Add the generated function info to DFNode
+  //  N->setGenFunc(F_X86, visc::CPU_TARGET);
   N->addGenFunc(F_X86, visc::GPU_TARGET, true);
   DEBUG(errs() << "Added GPUGenFunc: " << F_X86->getName() << " for node "
                << N->getFuncPointer()->getName() << "\n");
 
-
   // Loop over the arguments, to create the VMap
   dest_iterator = F_X86->arg_begin();
   for (Function::const_arg_iterator i = F->arg_begin(), e = F->arg_end();
@@ -412,51 +404,53 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
       break;
   }
 
-  assert(C->isDummyNode() == false && "Internal Node only contains dummy nodes!");
+  assert(C->isDummyNode() == false &&
+         "Internal Node only contains dummy nodes!");
 
   Function* CF = C->getFuncPointer();
   */
-  Function* KF = K->KernelLeafNode->getFuncPointer();
+  Function *KF = K->KernelLeafNode->getFuncPointer();
   // Initialize context
-  //DEBUG(errs() << "Initializing context" << "\n");
-  //CallInst::Create(llvm_visc_ocl_initContext, None, "", RI);
+  // DEBUG(errs() << "Initializing context" << "\n");
+  // CallInst::Create(llvm_visc_ocl_initContext, None, "", RI);
 
-  DEBUG(errs() << "Initializing commandQ" << "\n");
+  DEBUG(errs() << "Initializing commandQ"
+               << "\n");
   // Initialize command queue
   switchToTimer(visc_TimerID_SETUP, InitCall);
-  Value* fileStr = getStringPointer(FileName, InitCall, "Filename");
+  Value *fileStr = getStringPointer(FileName, InitCall, "Filename");
   DEBUG(errs() << "Kernel Filename constant: " << *fileStr << "\n");
-  DEBUG(errs() << "Generating code for kernel - " << K->KernelFunction->getName()<< "\n");
-  Value* kernelStr = getStringPointer(K->KernelFunction->getName(), InitCall,"KernelName");
-
-  Value* LaunchInstArgs[] = {fileStr, kernelStr};
-
-  DEBUG(errs() << "Inserting launch call" << "\n");
-  CallInst* NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch,
-                                         ArrayRef<Value*>(LaunchInstArgs, 2),
-                                         "graph"+KF->getName(),
-                                         InitCall);
+  DEBUG(errs() << "Generating code for kernel - "
+               << K->KernelFunction->getName() << "\n");
+  Value *kernelStr =
+      getStringPointer(K->KernelFunction->getName(), InitCall, "KernelName");
+
+  Value *LaunchInstArgs[] = {fileStr, kernelStr};
+
+  DEBUG(errs() << "Inserting launch call"
+               << "\n");
+  CallInst *NVPTX_Ctx = CallInst::Create(llvm_visc_ocl_launch,
+                                         ArrayRef<Value *>(LaunchInstArgs, 2),
+                                         "graph" + KF->getName(), InitCall);
   DEBUG(errs() << *NVPTX_Ctx << "\n");
-  GraphIDAddr = new GlobalVariable(M,
-                                   NVPTX_Ctx->getType(),
-                                   false,
+  GraphIDAddr = new GlobalVariable(M, NVPTX_Ctx->getType(), false,
                                    GlobalValue::CommonLinkage,
                                    Constant::getNullValue(NVPTX_Ctx->getType()),
-                                   "graph"+KF->getName()+".addr");
+                                   "graph" + KF->getName() + ".addr");
   DEBUG(errs() << "Store at: " << *GraphIDAddr << "\n");
-  StoreInst* SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall);
+  StoreInst *SI = new StoreInst(NVPTX_Ctx, GraphIDAddr, InitCall);
   DEBUG(errs() << *SI << "\n");
   switchToTimer(visc_TimerID_NONE, InitCall);
   switchToTimer(visc_TimerID_SETUP, RI);
-  Value* GraphID = new LoadInst(GraphIDAddr, "graph."+KF->getName(), RI);
+  Value *GraphID = new LoadInst(GraphIDAddr, "graph." + KF->getName(), RI);
 
   // Iterate over the required input edges of the node and use the visc-rt API
   // to set inputs
   DEBUG(errs() << "Iterate over input edges of node and insert visc api\n");
   std::vector<OutputPtr> OutputPointers;
-  // Vector to hold the device memory object that need to be cleared before we release
-  // context
-  std::vector<Value*> DevicePointers;
+  // Vector to hold the device memory objects that need to be cleared before we
+  // release the context.
+  std::vector<Value *> DevicePointers;
 
   std::map<unsigned, unsigned> &kernelInArgMap = K->getInArgMap();
   /*
@@ -468,133 +462,134 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
 
   */
 
-  for(auto &InArgMapPair  : kernelInArgMap) {
+  for (auto &InArgMapPair : kernelInArgMap) {
     unsigned i = InArgMapPair.first;
-    Value* inputVal = getArgumentAt(F_X86, InArgMapPair.second);
-    DEBUG(errs() << "\tArgument "<< i<< " = "  << *inputVal << "\n");
+    Value *inputVal = getArgumentAt(F_X86, InArgMapPair.second);
+    DEBUG(errs() << "\tArgument " << i << " = " << *inputVal << "\n");
 
     // input value has been obtained.
     // Check if input is a scalar value or a pointer operand
     // For scalar values such as int, float, etc. the size is simply the size of
     // type on target machine, but for pointers, the size of data would be the
     // next integer argument
-    if(inputVal->getType()->isPointerTy()) {
+    if (inputVal->getType()->isPointerTy()) {
 
       switchToTimer(visc_TimerID_COPY_PTR, RI);
       // Pointer Input
       // CheckAttribute
-      Value* isOutput = (hasAttribute(KF, i, Attribute::Out))? True : False;
-      Value* isInput = ((hasAttribute(KF, i, Attribute::Out))
-                        && !(hasAttribute(KF, i, Attribute::In)))? False : True;
-
-      Argument* A = getArgumentAt(KF, i);
-      if(isOutput == True) {
+      Value *isOutput = (hasAttribute(KF, i, Attribute::Out)) ? True : False;
+      Value *isInput = ((hasAttribute(KF, i, Attribute::Out)) &&
+                        !(hasAttribute(KF, i, Attribute::In)))
+                           ? False
+                           : True;
+
+      Argument *A = getArgumentAt(KF, i);
+      if (isOutput == True) {
         DEBUG(errs() << *A << " is an OUTPUT argument\n");
       }
-      if(isInput == True) {
+      if (isInput == True) {
         DEBUG(errs() << *A << " is an INPUT argument\n");
       }
 
-
-      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputVal,
-                             Type::getInt8PtrTy(M.getContext()),
-                             inputVal->getName()+".i8ptr",
-                             RI);
+      Value *inputValI8Ptr = CastInst::CreatePointerCast(
+          inputVal, Type::getInt8PtrTy(M.getContext()),
+          inputVal->getName() + ".i8ptr", RI);
 
       // Assert that the pointer argument size (next argument) is in the map
-      assert(kernelInArgMap.find(i+1) != kernelInArgMap.end());
-
-      Value* inputSize = getArgumentAt(F_X86, kernelInArgMap[i+1]);
-      assert(inputSize->getType() == Type::getInt64Ty(M.getContext())
-             && "Pointer type input must always be followed by size (integer type)");
-      Value* setInputArgs[] = {GraphID,
-                               inputValI8Ptr,
-                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                               inputSize,
-                               isInput,
-                               isOutput
-                              };
-      Value* d_ptr = CallInst::Create(llvm_visc_ocl_argument_ptr,
-                                      ArrayRef<Value*>(setInputArgs, 6), "", RI);
+      assert(kernelInArgMap.find(i + 1) != kernelInArgMap.end());
+
+      Value *inputSize = getArgumentAt(F_X86, kernelInArgMap[i + 1]);
+      assert(
+          inputSize->getType() == Type::getInt64Ty(M.getContext()) &&
+          "Pointer type input must always be followed by size (integer type)");
+      Value *setInputArgs[] = {
+          GraphID,
+          inputValI8Ptr,
+          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
+          inputSize,
+          isInput,
+          isOutput};
+      Value *d_ptr =
+          CallInst::Create(llvm_visc_ocl_argument_ptr,
+                           ArrayRef<Value *>(setInputArgs, 6), "", RI);
       DevicePointers.push_back(d_ptr);
       // If this has out attribute, store the returned device pointer in
       // memory to read device memory later
-      if(isOutput == True) OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
-    }
-    else {
+      if (isOutput == True)
+        OutputPointers.push_back(OutputPtr(inputValI8Ptr, d_ptr, inputSize));
+    } else {
       switchToTimer(visc_TimerID_COPY_SCALAR, RI);
       // Scalar Input
       // Store the scalar value on stack and then pass the pointer to its
       // location
-      AllocaInst* inputValPtr = new AllocaInst(inputVal->getType(), 0, inputVal->getName()+".ptr", RI);
-      StoreInst* SI = new StoreInst(inputVal, inputValPtr, RI);
-
-      Value* inputValI8Ptr = CastInst::CreatePointerCast(inputValPtr,
-                             Type::getInt8PtrTy(M.getContext()),
-                             inputVal->getName()+".i8ptr",
-                             RI);
-
-      Value* setInputArgs[] = {GraphID,
-                               inputValI8Ptr,
-                               ConstantInt::get(Type::getInt32Ty(M.getContext()),i),
-                               ConstantExpr::getSizeOf(inputVal->getType())
-                              };
+      AllocaInst *inputValPtr = new AllocaInst(
+          inputVal->getType(), 0, inputVal->getName() + ".ptr", RI);
+      StoreInst *SI = new StoreInst(inputVal, inputValPtr, RI);
+
+      Value *inputValI8Ptr = CastInst::CreatePointerCast(
+          inputValPtr, Type::getInt8PtrTy(M.getContext()),
+          inputVal->getName() + ".i8ptr", RI);
+
+      Value *setInputArgs[] = {
+          GraphID, inputValI8Ptr,
+          ConstantInt::get(Type::getInt32Ty(M.getContext()), i),
+          ConstantExpr::getSizeOf(inputVal->getType())};
       CallInst::Create(llvm_visc_ocl_argument_scalar,
-                       ArrayRef<Value*>(setInputArgs, 4), "", RI);
+                       ArrayRef<Value *>(setInputArgs, 4), "", RI);
     }
   }
 
-  DEBUG(errs() << "Setup shared memory arguments of node and insert visc api\n");
+  DEBUG(
+      errs() << "Setup shared memory arguments of node and insert visc api\n");
 
   // Check to see if all the allocation sizes are constant (determined
   // statically)
   bool constSizes = true;
-  for (auto& e: K->getSharedInArgMap()) {
+  for (auto &e : K->getSharedInArgMap()) {
     constSizes &= isa<Constant>(e.second.first);
   }
 
   // If the sizes are all constant
   if (constSizes) {
-    for (auto& e: K->getSharedInArgMap()) {
+    for (auto &e : K->getSharedInArgMap()) {
       unsigned argNum = e.first;
-      Value* allocSize = e.second.first;
+      Value *allocSize = e.second.first;
 
-      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+      DEBUG(errs() << "\tLocal Memory at " << argNum
+                   << ", size = " << *allocSize << "\n");
 
       if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
         // Shared memory ptr argument - scalar at size position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
 
-        assert(isa<Constant>(allocSize) && "Constant shared memory size is expected");
+        assert(isa<Constant>(allocSize) &&
+               "Constant shared memory size is expected");
 
-        Value* setInputArgs[] = {GraphID,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 allocSize
-                                };
+        Value *setInputArgs[] = {
+            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            allocSize};
         CallInst::Create(llvm_visc_ocl_argument_shared,
-                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
-      }
-      else {
+                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
+      } else {
         // Sharem memory size argument - scalar at address position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
         // Store the scalar value on stack and then pass the pointer to its
         // location
-        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0,
-            allocSize->getName()+".sharedMem.ptr", RI);
-        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
-
-        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
-                                Type::getInt8PtrTy(M.getContext()),
-                                allocSize->getName()+".sharedMem.i8ptr",
-                                RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                 allocSizeI8Ptr,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 ConstantExpr::getSizeOf(allocSize->getType())
-                                };
+        AllocaInst *allocSizePtr =
+            new AllocaInst(allocSize->getType(), 0,
+                           allocSize->getName() + ".sharedMem.ptr", RI);
+        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
+            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
+            allocSize->getName() + ".sharedMem.i8ptr", RI);
+
+        Value *setInputArgs[] = {
+            GraphID, allocSizeI8Ptr,
+            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            ConstantExpr::getSizeOf(allocSize->getType())};
         CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
       }
     }
   } else {
@@ -615,68 +610,64 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
       ExtractValueInstVec.push_back(EI);
     }
 
-    for (auto& e: K->getSharedInArgMap()) {
+    for (auto &e : K->getSharedInArgMap()) {
       unsigned argNum = e.first;
-      Value* allocSize = ExtractValueInstVec[e.second.second/2];
+      Value *allocSize = ExtractValueInstVec[e.second.second / 2];
 
-      DEBUG(errs() << "\tLocal Memory at "<< argNum << ", size = "  << *allocSize << "\n");
+      DEBUG(errs() << "\tLocal Memory at " << argNum
+                   << ", size = " << *allocSize << "\n");
 
       if (KF->getFunctionType()->getParamType(argNum)->isPointerTy()) {
         // Shared memory ptr argument - scalar at size position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
 
-        Value* setInputArgs[] = {GraphID,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 allocSize
-                                };
+        Value *setInputArgs[] = {
+            GraphID, ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            allocSize};
         CallInst::Create(llvm_visc_ocl_argument_shared,
-                         ArrayRef<Value*>(setInputArgs, 3), "", RI);
-      }
-      else {
+                         ArrayRef<Value *>(setInputArgs, 3), "", RI);
+      } else {
         // Sharem memory size argument - scalar at address position
         switchToTimer(visc_TimerID_COPY_SCALAR, RI);
         // Store the scalar value on stack and then pass the pointer to its
         // location
-        AllocaInst* allocSizePtr = new AllocaInst(allocSize->getType(), 0, 
-            allocSize->getName()+".sharedMem.ptr", RI);
-        StoreInst* SI = new StoreInst(allocSize, allocSizePtr, RI);
-
-        Value* allocSizeI8Ptr = CastInst::CreatePointerCast(allocSizePtr,
-                                Type::getInt8PtrTy(M.getContext()),
-                                allocSize->getName()+".sharedMem.i8ptr",
-                                RI);
-
-        Value* setInputArgs[] = {GraphID,
-                                 allocSizeI8Ptr,
-                                 ConstantInt::get(Type::getInt32Ty(M.getContext()),argNum),
-                                 ConstantExpr::getSizeOf(allocSize->getType())
-                                };
+        AllocaInst *allocSizePtr =
+            new AllocaInst(allocSize->getType(), 0,
+                           allocSize->getName() + ".sharedMem.ptr", RI);
+        StoreInst *SI = new StoreInst(allocSize, allocSizePtr, RI);
+
+        Value *allocSizeI8Ptr = CastInst::CreatePointerCast(
+            allocSizePtr, Type::getInt8PtrTy(M.getContext()),
+            allocSize->getName() + ".sharedMem.i8ptr", RI);
+
+        Value *setInputArgs[] = {
+            GraphID, allocSizeI8Ptr,
+            ConstantInt::get(Type::getInt32Ty(M.getContext()), argNum),
+            ConstantExpr::getSizeOf(allocSize->getType())};
         CallInst::Create(llvm_visc_ocl_argument_scalar,
-                         ArrayRef<Value*>(setInputArgs, 4), "", RI);
+                         ArrayRef<Value *>(setInputArgs, 4), "", RI);
       }
     }
   }
 
-
   DEBUG(errs() << "Setup output edges of node and insert visc api\n");
   // Set output if struct is not an empty struct
-  StructType* OutputTy = K->KernelLeafNode->getOutputType();
-  std::vector<Value*> d_Outputs;
-  if(!OutputTy->isEmptyTy()) {
+  StructType *OutputTy = K->KernelLeafNode->getOutputType();
+  std::vector<Value *> d_Outputs;
+  if (!OutputTy->isEmptyTy()) {
     switchToTimer(visc_TimerID_COPY_PTR, RI);
     // Not an empty struct
     // Iterate over all elements of the struct and put them in
-    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
-      unsigned outputIndex = KF->getFunctionType()->getNumParams()+i;
-      Value* setOutputArgs[] = {GraphID,
-                                ConstantInt::get(Type::getInt32Ty(M.getContext()),outputIndex),
-                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))
-                               };
-
-      CallInst* d_Output = CallInst::Create(llvm_visc_ocl_output_ptr,
-                                            ArrayRef<Value*>(setOutputArgs, 3),
-                                            "d_output."+KF->getName(),
-                                            RI);
+    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
+      unsigned outputIndex = KF->getFunctionType()->getNumParams() + i;
+      Value *setOutputArgs[] = {
+          GraphID,
+          ConstantInt::get(Type::getInt32Ty(M.getContext()), outputIndex),
+          ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
+
+      CallInst *d_Output = CallInst::Create(llvm_visc_ocl_output_ptr,
+                                            ArrayRef<Value *>(setOutputArgs, 3),
+                                            "d_output." + KF->getName(), RI);
       d_Outputs.push_back(d_Output);
     }
   }
@@ -690,46 +681,37 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   Value *workDim, *LocalWGPtr, *GlobalWGPtr;
   getExecuteNodeParams(M, workDim, LocalWGPtr, GlobalWGPtr, K, VMap, RI);
   switchToTimer(visc_TimerID_KERNEL, RI);
-  Value* ExecNodeArgs[] = {GraphID,
-                           workDim,
-                           LocalWGPtr,
-                           GlobalWGPtr
-                          };
-  CallInst* Event = CallInst::Create(llvm_visc_ocl_executeNode,
-                                     ArrayRef<Value*>(ExecNodeArgs, 4),
-                                     "event."+KF->getName(),
-                                     RI);
+  Value *ExecNodeArgs[] = {GraphID, workDim, LocalWGPtr, GlobalWGPtr};
+  CallInst *Event = CallInst::Create(llvm_visc_ocl_executeNode,
+                                     ArrayRef<Value *>(ExecNodeArgs, 4),
+                                     "event." + KF->getName(), RI);
   DEBUG(errs() << "Execute Node Call: " << *Event << "\n");
 
   // Wait for Kernel to Finish
-  CallInst::Create(llvm_visc_ocl_wait,
-                   ArrayRef<Value*>(GraphID),
-                   "",
-                   RI);
+  CallInst::Create(llvm_visc_ocl_wait, ArrayRef<Value *>(GraphID), "", RI);
 
   switchToTimer(visc_TimerID_READ_OUTPUT, RI);
   // Read Output Struct if not empty
-  if(!OutputTy->isEmptyTy()) {
-    std::vector<Value*>h_Outputs;
-    Value* KernelOutput = UndefValue::get(OutputTy);
-    for(unsigned i=0; i < OutputTy->getNumElements(); i++) {
-      Value* GetOutputArgs[] = {GraphID,
-                                Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
-                                d_Outputs[i],
-                                ConstantExpr::getSizeOf(OutputTy->getElementType(i))
-                               };
-      CallInst* h_Output = CallInst::Create(llvm_visc_ocl_getOutput,
-                                            ArrayRef<Value*>(GetOutputArgs, 4),
-                                            "h_output."+KF->getName()+".addr",
-                                            RI);
+  if (!OutputTy->isEmptyTy()) {
+    std::vector<Value *> h_Outputs;
+    Value *KernelOutput = UndefValue::get(OutputTy);
+    for (unsigned i = 0; i < OutputTy->getNumElements(); i++) {
+      Value *GetOutputArgs[] = {
+          GraphID, Constant::getNullValue(Type::getInt8PtrTy(M.getContext())),
+          d_Outputs[i], ConstantExpr::getSizeOf(OutputTy->getElementType(i))};
+      CallInst *h_Output = CallInst::Create(
+          llvm_visc_ocl_getOutput, ArrayRef<Value *>(GetOutputArgs, 4),
+          "h_output." + KF->getName() + ".addr", RI);
       // Read each device pointer listed in output struct
       // Load the output struct
-      CastInst* BI = BitCastInst::CreatePointerCast(h_Output,
-                     OutputTy->getElementType(i)->getPointerTo(), "output.ptr", RI);
-
-      Value* OutputElement = new LoadInst(BI, "output."+KF->getName(), RI);
-      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement, ArrayRef<unsigned>(i),
-                                             KF->getName()+"output", RI);
+      CastInst *BI = BitCastInst::CreatePointerCast(
+          h_Output, OutputTy->getElementType(i)->getPointerTo(), "output.ptr",
+          RI);
+
+      Value *OutputElement = new LoadInst(BI, "output." + KF->getName(), RI);
+      KernelOutput = InsertValueInst::Create(KernelOutput, OutputElement,
+                                             ArrayRef<unsigned>(i),
+                                             KF->getName() + "output", RI);
     }
     OutputMap[K->KernelLeafNode] = KernelOutput;
   }
@@ -744,75 +726,76 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
     DEBUG(errs() << "\tTo: " << *output.h_ptr << "\n");
     DEBUG(errs() << "\t#bytes: " << *output.bytes << "\n");
 
-    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr, output.bytes};
-    CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput,
+    Value* GetOutputArgs[] = {GraphID, output.h_ptr, output.d_ptr,
+  output.bytes}; CallInst* CI = CallInst::Create(llvm_visc_ocl_getOutput,
                                     ArrayRef<Value*>(GetOutputArgs, 4),
                                     "", RI);
   }*/
   switchToTimer(visc_TimerID_MEM_FREE, RI);
   // Clear Context and free device memory
-  DEBUG(errs() << "Clearing context" << "\n");
+  DEBUG(errs() << "Clearing context"
+               << "\n");
   // Free Device Memory
-  for(auto d_ptr: DevicePointers) {
-    CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value*>(d_ptr), "", RI);
+  for (auto d_ptr : DevicePointers) {
+    CallInst::Create(llvm_visc_ocl_free, ArrayRef<Value *>(d_ptr), "", RI);
   }
   switchToTimer(visc_TimerID_CLEAR_CTX, CleanupCall);
   // Clear Context
-  LoadInst* LI = new LoadInst(GraphIDAddr, "", CleanupCall);
-  CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value*>(LI), "", CleanupCall);
+  LoadInst *LI = new LoadInst(GraphIDAddr, "", CleanupCall);
+  CallInst::Create(llvm_visc_ocl_clearContext, ArrayRef<Value *>(LI), "",
+                   CleanupCall);
   switchToTimer(visc_TimerID_NONE, CleanupCall);
 
   switchToTimer(visc_TimerID_MISC, RI);
   DEBUG(errs() << "*** Generating epilogue code for the function****\n");
   // Generate code for output bindings
   // Get Exit node
-  DFNode* C = N->getChildGraph()->getExit();
+  DFNode *C = N->getChildGraph()->getExit();
   // Get OutputType of this node
-  StructType* OutTy = N->getOutputType();
+  StructType *OutTy = N->getOutputType();
   Value *retVal = UndefValue::get(F_X86->getReturnType());
   // Find the kernel's output arg map, to use instead of the bindings
   std::vector<unsigned> outArgMap = kernel->getOutArgMap();
   // Find all the input edges to exit node
-  for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+  for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
     DEBUG(errs() << "Output Edge " << i << "\n");
     // Find the incoming edge at the requested input port
-    DFEdge* E = C->getInDFEdgeAt(i);
+    DFEdge *E = C->getInDFEdgeAt(i);
 
     assert(E && "No Binding for output element!");
     // Find the Source DFNode associated with the incoming edge
-    DFNode* SrcDF = E->getSourceDF();
+    DFNode *SrcDF = E->getSourceDF();
 
-    DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
+                 << "\n");
 
     // If Source DFNode is a dummyNode, edge is from parent. Get the
     // argument from argument list of this internal node
-    Value* inputVal;
-    if(SrcDF->isEntryNode()) {
+    Value *inputVal;
+    if (SrcDF->isEntryNode()) {
       inputVal = getArgumentAt(F_X86, i);
-      DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-    }
-    else {
+      DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
+    } else {
       // edge is from a internal node
       // Check - code should already be generated for this source dfnode
       // FIXME: Since the 2-level kernel code gen has aspecific structure, we
       // can assume the SrcDF is same as Kernel Leaf node.
       // Use outArgMap to get correct mapping
       SrcDF = K->KernelLeafNode;
-      assert(OutputMap.count(SrcDF)
-             && "Source node call not found. Dependency violation!");
+      assert(OutputMap.count(SrcDF) &&
+             "Source node call not found. Dependency violation!");
 
       // Find Output Value associated with the Source DFNode using OutputMap
-      Value* CI = OutputMap[SrcDF];
+      Value *CI = OutputMap[SrcDF];
 
       // Extract element at source position from this call instruction
       std::vector<unsigned> IndexList;
       // i is the destination of DFEdge E
       // Use the mapping instead of the bindings
-//      IndexList.push_back(E->getSourcePosition());
+      //      IndexList.push_back(E->getSourcePosition());
       IndexList.push_back(outArgMap[i]);
-      DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-      ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                             "",RI);
+      DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI << "\n");
+      ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI);
       inputVal = EI;
     }
     std::vector<unsigned> IdxList;
@@ -823,29 +806,31 @@ void CGT_NVPTX::insertRuntimeCalls(DFInternalNode* N, Kernel* K, const Twine& Fi
   DEBUG(errs() << "Extracted all\n");
   switchToTimer(visc_TimerID_NONE, RI);
   retVal->setName("output");
-  ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+  ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal);
   ReplaceInstWithInst(RI, newRI);
 }
 
-
 // Right now, only targeting the one level case. In general, device functions
 // can return values so we don't need to change them
-void CGT_NVPTX::codeGen(DFInternalNode* N) {
-  DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName() << "\n");
-  if(KernelLaunchNode == NULL)
+void CGT_NVPTX::codeGen(DFInternalNode *N) {
+  DEBUG(errs() << "Inside internal node: " << N->getFuncPointer()->getName()
+               << "\n");
+  if (KernelLaunchNode == NULL)
     DEBUG(errs() << "No kernel launch node\n");
   else {
-    DEBUG(errs() << "KernelLaunchNode: " << KernelLaunchNode->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "KernelLaunchNode: "
+                 << KernelLaunchNode->getFuncPointer()->getName() << "\n");
   }
 
   if (!KernelLaunchNode) {
-    DEBUG(errs() << "No code generated (host code for kernel launch complete).\n");
+    DEBUG(errs()
+          << "No code generated (host code for kernel launch complete).\n");
     return;
   }
 
   if (N == KernelLaunchNode) {
     DEBUG(errs() << "Found kernel launch node. Generating host code.\n");
-    //TODO
+    // TODO
 
     // Now the remaining nodes to be visited should be ignored
     KernelLaunchNode = NULL;
@@ -860,7 +845,8 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
     // TODO: Structure assumed: one thread node, one allocation node (at most),
     // TB node
     std::map<unsigned, unsigned> inmapFinal;
-    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(), ie = inmap2.end();
+    for (std::map<unsigned, unsigned>::iterator ib = inmap2.begin(),
+                                                ie = inmap2.end();
          ib != ie; ++ib) {
       inmapFinal[ib->first] = inmap1[ib->second];
     }
@@ -877,8 +863,9 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
     // 0 ... outmap2.size()-1
     // The limit is the size of outmap2, because this is the number of kernel
     // output arguments for which the mapping matters
-    // For now, it reasonable to assume that all the kernel arguments are returned,
-    // maybe plys some others from other nodes, thus outmap2.size() <= outmap1.size()
+    // For now, it is reasonable to assume that all the kernel arguments are
+    // returned, maybe plus some others from other nodes, thus outmap2.size() <=
+    // outmap1.size()
     for (unsigned i = 0; i < outmap2.size(); i++) {
       outmap1[i] = outmap2[outmap1[i]];
     }
@@ -886,15 +873,14 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
 
     // Track the source of local dimlimits for the kernel
     // Dimension limit can either be a constant or an argument of parent
-    // function. Since Internal node would no longer exist, we need to insert the
-    // localWGSize with values from the parent of N.
-    std::vector<Value*> localWGSizeMapped;
+    // function. Since Internal node would no longer exist, we need to insert
+    // the localWGSize with values from the parent of N.
+    std::vector<Value *> localWGSizeMapped;
     for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
       if (isa<Constant>(kernel->localWGSize[i])) {
         // if constant, use as it is
         localWGSizeMapped.push_back(kernel->localWGSize[i]);
-      }
-      else if (Argument* Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
+      } else if (Argument *Arg = dyn_cast<Argument>(kernel->localWGSize[i])) {
         // if argument, find the argument location in N. Use InArgMap of N to
         // find the source location in Parent of N. Retrieve the argument from
         // parent to insert in the vector.
@@ -904,46 +890,49 @@ void CGT_NVPTX::codeGen(DFInternalNode* N) {
         assert(N->getInArgMap().find(argNum) != N->getInArgMap().end());
 
         unsigned parentArgNum = N->getInArgMap()[argNum];
-        Argument* A = getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
+        Argument *A =
+            getArgumentAt(N->getParent()->getFuncPointer(), parentArgNum);
         localWGSizeMapped.push_back(A);
-      }
-      else {
-        assert(false && "LocalWGsize using value which is neither argument nor constant!");
+      } else {
+        assert(
+            false &&
+            "LocalWGsize using value which is neither argument nor constant!");
       }
     }
     // Update localWGSize vector of kernel
     kernel->setLocalWGSize(localWGSizeMapped);
   }
-
 }
 
-void CGT_NVPTX::codeGen(DFLeafNode* N) {
-  DEBUG(errs() << "Inside leaf node: " << N->getFuncPointer()->getName() << "\n");
+void CGT_NVPTX::codeGen(DFLeafNode *N) {
+  DEBUG(errs() << "Inside leaf node: " << N->getFuncPointer()->getName()
+               << "\n");
 
   // Skip code generation if it is a dummy node
-  if(N->isDummyNode()) {
+  if (N->isDummyNode()) {
     DEBUG(errs() << "Skipping dummy node\n");
     return;
   }
 
   // Skip code generation if it is an allocation node
-  if(N->isAllocationNode()) {
+  if (N->isAllocationNode()) {
     DEBUG(errs() << "Skipping allocation node\n");
     return;
   }
 
   // Generate code only if it has the right hint
-//  if(!checkPreferredTarget(N, visc::GPU_TARGET)) {
-//    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
-//    return;
-//  }
-  if(!preferredTargetIncludes(N, visc::GPU_TARGET)) {
-    DEBUG(errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n");
+  //  if(!checkPreferredTarget(N, visc::GPU_TARGET)) {
+  //    errs() << "Skipping node: "<< N->getFuncPointer()->getName() << "\n";
+  //    return;
+  //  }
+  if (!preferredTargetIncludes(N, visc::GPU_TARGET)) {
+    DEBUG(errs() << "Skipping node: " << N->getFuncPointer()->getName()
+                 << "\n");
     return;
   }
 
   // Checking which node is the kernel launch
-  DFNode* PNode = N->getParent();
+  DFNode *PNode = N->getParent();
   int pLevel = PNode->getLevel();
   int pReplFactor = PNode->getNumOfDim();
 
@@ -956,37 +945,35 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   assert((pLevel > 0) && "Root not allowed to be chosen as Kernel Node.");
 
   // Only these options are supported
-  enum XLevelHierarchy{ONE_LEVEL, TWO_LEVEL} SelectedHierarchy;
-  if(pLevel == 1 || !pReplFactor) {
-    DEBUG(errs() << "*************** Kernel Gen: 1-Level Hierarchy **************\n");
+  enum XLevelHierarchy { ONE_LEVEL, TWO_LEVEL } SelectedHierarchy;
+  if (pLevel == 1 || !pReplFactor) {
+    DEBUG(errs()
+          << "*************** Kernel Gen: 1-Level Hierarchy **************\n");
     SelectedHierarchy = ONE_LEVEL;
     KernelLaunchNode = PNode;
-    kernel = new Kernel(NULL,
-                        N,
-                        N->getInArgMap(),
-                        N->getSharedInArgMap(),
-                        N->getOutArgMap(),
-                        N->getNumOfDim(),
-                        N->getDimLimits());
-  }
-  else {
+    kernel = new Kernel(NULL, N, N->getInArgMap(), N->getSharedInArgMap(),
+                        N->getOutArgMap(), N->getNumOfDim(), N->getDimLimits());
+  } else {
     // Converting a 2-level DFG to opencl kernel
-    DEBUG(errs() << "*************** Kernel Gen: 2-Level Hierarchy **************\n");
-    assert((pLevel >= 2) && "Selected node not nested deep enough to be Kernel Node.");
+    DEBUG(errs()
+          << "*************** Kernel Gen: 2-Level Hierarchy **************\n");
+    assert((pLevel >= 2) &&
+           "Selected node not nested deep enough to be Kernel Node.");
     SelectedHierarchy = TWO_LEVEL;
     KernelLaunchNode = PNode->getParent();
-    assert((PNode->getNumOfDim() == N->getNumOfDim()) && "Dimension number must match");
+    assert((PNode->getNumOfDim() == N->getNumOfDim()) &&
+           "Dimension number must match");
     // Contains the instructions generating the kernel configuration parameters
-    kernel = new Kernel(NULL,                 // kernel function
-                        N,                    // kernel leaf node
-                        N->getInArgMap(),     // kenel argument mapping
+    kernel = new Kernel(NULL,             // kernel function
+                        N,                // kernel leaf node
+                        N->getInArgMap(), // kernel argument mapping
                         N->getSharedInArgMap(),
-                        N->getOutArgMap(),    // kernel output mapping from the leaf to the interemediate node
-                        PNode->getNumOfDim(), // gridDim
-                        PNode->getDimLimits(),// grid size
-                        N->getNumOfDim(),     // blockDim
-                        N->getDimLimits());   // block size
-
+                        N->getOutArgMap(),     // kernel output mapping from the
+                                               // leaf to the intermediate node
+                        PNode->getNumOfDim(),  // gridDim
+                        PNode->getDimLimits(), // grid size
+                        N->getNumOfDim(),      // blockDim
+                        N->getDimLimits());    // block size
   }
 
   std::vector<Instruction *> IItoRemove;
@@ -998,58 +985,62 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // Look up if we have visited this function before. If we have, then just
   // get the cloned function pointer from DFNode. Otherwise, create the cloned
   // function and add it to the DFNode GenFunc.
-//  Function *F_nvptx = N->getGenFunc();
+  //  Function *F_nvptx = N->getGenFunc();
   Function *F_nvptx = N->getGenFuncForTarget(visc::GPU_TARGET);
 
-  assert(F_nvptx == NULL && "Error: Visiting a node for which code already generated");
+  assert(F_nvptx == NULL &&
+         "Error: Visiting a node for which code already generated");
   // Clone the function
   ValueToValueMapTy VMap;
 
-  //F_nvptx->setName(FName+"_nvptx");
+  // F_nvptx->setName(FName+"_nvptx");
 
   Twine FName = F->getName();
   StringRef fStr = FName.getSingleStringRef();
-  Twine newFName = Twine(fStr, "_nvptx"); 
+  Twine newFName = Twine(fStr, "_nvptx");
   F_nvptx = CloneFunction(F, VMap);
   F_nvptx->setName(newFName);
 
-  
   //  errs() << "Old Function Name: " << F->getName() << "\n";
   //  errs() << "New Function Name: " << F_nvptx->getName() << "\n";
 
   F_nvptx->removeFromParent();
 
-
   // Insert the cloned function into the kernels module
   KernelM->getFunctionList().push_back(F_nvptx);
 
-
-  //TODO: Iterate over all the instructions of F_nvptx and identify the
-  //callees and clone them into this module.
+  // TODO: Iterate over all the instructions of F_nvptx and identify the
+  // callees and clone them into this module.
   DEBUG(errs() << *F_nvptx->getType());
   DEBUG(errs() << *F_nvptx);
 
   // Transform  the function to void and remove all target dependent attributes
   // from the function
   F_nvptx = transformFunctionToVoid(F_nvptx);
-  
-  //Add generated function info to DFNode
-//  N->setGenFunc(F_nvptx, visc::GPU_TARGET);
+
+  // Add generated function info to DFNode
+  //  N->setGenFunc(F_nvptx, visc::GPU_TARGET);
   N->addGenFunc(F_nvptx, visc::GPU_TARGET, false);
 
-  DEBUG(errs() << "Removing all attributes from Kernel Function and adding nounwind\n");
-  F_nvptx->removeAttributes(AttributeList::FunctionIndex, F_nvptx->getAttributes().getFnAttributes());
+  DEBUG(
+      errs()
+      << "Removing all attributes from Kernel Function and adding nounwind\n");
+  F_nvptx->removeAttributes(AttributeList::FunctionIndex,
+                            F_nvptx->getAttributes().getFnAttributes());
   F_nvptx->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
 
-  //FIXME: For now, assume only one allocation node
+  // FIXME: For now, assume only one allocation node
   kernel->AllocationNode = NULL;
 
-  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(), iee = N->indfedge_end();
+  for (DFNode::const_indfedge_iterator ieb = N->indfedge_begin(),
+                                       iee = N->indfedge_end();
        ieb != iee; ++ieb) {
     DFNode *SrcDFNode = (*ieb)->getSourceDF();
-    DEBUG(errs() << "Found edge from node: " << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
+    DEBUG(errs() << "Found edge from node: "
+                 << " " << SrcDFNode->getFuncPointer()->getName() << "\n");
     DEBUG(errs() << "Current Node: " << N->getFuncPointer()->getName() << "\n");
-    DEBUG(errs() << "isAllocationNode = "<< SrcDFNode->isAllocationNode() << "\n");
+    DEBUG(errs() << "isAllocationNode = " << SrcDFNode->isAllocationNode()
+                 << "\n");
     if (!SrcDFNode->isDummyNode()) {
       assert(SrcDFNode->isAllocationNode());
       kernel->AllocationNode = dyn_cast<DFLeafNode>(SrcDFNode);
@@ -1065,10 +1056,11 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   if (kernel->AllocationNode) {
 
     ValueToValueMapTy VMap;
-    Function *F_alloc = CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
-    //F_alloc->removeFromParent();
+    Function *F_alloc =
+        CloneFunction(kernel->AllocationNode->getFuncPointer(), VMap);
+    // F_alloc->removeFromParent();
     // Insert the cloned function into the kernels module
-    //M.getFunctionList().push_back(F_alloc);
+    // M.getFunctionList().push_back(F_alloc);
 
     std::vector<IntrinsicInst *> ViscMallocInstVec;
     findIntrinsicInst(F_alloc, Intrinsic::visc_malloc, ViscMallocInstVec);
@@ -1076,7 +1068,8 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
     for (unsigned i = 0; i < ViscMallocInstVec.size(); i++) {
       IntrinsicInst *II = ViscMallocInstVec[i];
       assert(II->hasOneUse() && "visc_malloc result is used more than once");
-      II->replaceAllUsesWith(ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
+      II->replaceAllUsesWith(
+          ConstantPointerNull::get(Type::getInt8PtrTy(M.getContext())));
       II->eraseFromParent();
     }
     kernel->AllocationFunction = F_alloc;
@@ -1091,15 +1084,19 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         assert(RetStructTy && "Allocation node does not return a struct type");
         unsigned numFields = RetStructTy->getNumElements();
     */
-    std::map<unsigned, std::pair<Value*, unsigned> > sharedInMap = kernel->getSharedInArgMap();
-    AllocationNodeProperty* APN =
-      (AllocationNodeProperty*) kernel->AllocationNode->getProperty(DFNode::Allocation);
-    for (auto& AllocPair: APN->getAllocationList()) {
+    std::map<unsigned, std::pair<Value *, unsigned>> sharedInMap =
+        kernel->getSharedInArgMap();
+    AllocationNodeProperty *APN =
+        (AllocationNodeProperty *)kernel->AllocationNode->getProperty(
+            DFNode::Allocation);
+    for (auto &AllocPair : APN->getAllocationList()) {
       unsigned destPos = AllocPair.first->getDestPosition();
       unsigned srcPos = AllocPair.first->getSourcePosition();
       SharedMemArgs.push_back(destPos);
-      sharedInMap[destPos] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
-      sharedInMap[destPos+1] = std::pair<Value *, unsigned>(AllocPair.second, srcPos+1);
+      sharedInMap[destPos] =
+          std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1);
+      sharedInMap[destPos + 1] =
+          std::pair<Value *, unsigned>(AllocPair.second, srcPos + 1);
     }
     kernel->setSharedInArgMap(sharedInMap);
   }
@@ -1109,12 +1106,14 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // global address space
   unsigned argIndex = 0;
   std::vector<unsigned> GlobalMemArgs;
-  for(Function::arg_iterator ai = F_nvptx->arg_begin(), ae = F_nvptx->arg_end();
-      ai != ae; ++ai) {
-    if (ai->getType()->isPointerTy()) {    
-      // If the arguement is already chosen for shared memory arguemnt list, skip.
-      // Else put it in Global memory arguement list
-      if(std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) == 0) {
+  for (Function::arg_iterator ai = F_nvptx->arg_begin(),
+                              ae = F_nvptx->arg_end();
+       ai != ae; ++ai) {
+    if (ai->getType()->isPointerTy()) {
+      // If the argument is already chosen for shared memory argument list,
+      // skip. Else put it in Global memory argument list
+      if (std::count(SharedMemArgs.begin(), SharedMemArgs.end(), argIndex) ==
+          0) {
         GlobalMemArgs.push_back(argIndex);
       }
     }
@@ -1128,20 +1127,21 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
   // Optimization: Gloabl memory arguments, which are not modified and whose
   // loads are not dependent on node id of current node, should be moved to
   // constant memory, subject to size of course
-  std::vector<unsigned> ConstantMemArgs = globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx);
+  std::vector<unsigned> ConstantMemArgs =
+      globalToConstantMemoryOpt(&GlobalMemArgs, F_nvptx);
 
   F_nvptx = changeArgAddrspace(F_nvptx, ConstantMemArgs, GLOBAL_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, SharedMemArgs, SHARED_ADDRSPACE);
   F_nvptx = changeArgAddrspace(F_nvptx, GlobalMemArgs, GLOBAL_ADDRSPACE);
 
-// Function to replace call instructions to functions in the kernel
+  // Function to replace call instructions to functions in the kernel
   std::map<Function *, Function *> OrgToClonedFuncMap;
   std::vector<Function *> FuncToBeRemoved;
-  auto CloneAndReplaceCall = [&] (CallInst *CI, Function *OrgFunc) {
-    Function* NewFunc;
+  auto CloneAndReplaceCall = [&](CallInst *CI, Function *OrgFunc) {
+    Function *NewFunc;
     // Check if the called function has already been cloned before.
     auto It = OrgToClonedFuncMap.find(OrgFunc);
-    if(It == OrgToClonedFuncMap.end()) {
+    if (It == OrgToClonedFuncMap.end()) {
       ValueToValueMapTy VMap;
       NewFunc = CloneFunction(OrgFunc, VMap);
       OrgToClonedFuncMap[OrgFunc] = NewFunc;
@@ -1150,42 +1150,47 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
       NewFunc = (*It).second;
     }
     // Replace the calls to this function
-    std::vector<Value*> args;
-    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
+    std::vector<Value *> args;
+    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
       args.push_back(CI->getArgOperand(i));
     }
-    CallInst* Inst = CallInst::Create(NewFunc, args,
-        OrgFunc->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
+    CallInst *Inst = CallInst::Create(
+        NewFunc, args,
+        OrgFunc->getReturnType()->isVoidTy() ? "" : CI->getName(), CI);
     CI->replaceAllUsesWith(Inst);
     IItoRemove.push_back(CI);
     return NewFunc;
   };
 
-
   // Go through all the instructions
-  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
+  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e;
+       ++i) {
     Instruction *I = &(*i);
     // Leaf nodes should not contain VISC graph intrinsics or launch
-    assert(!BuildDFG::isViscLaunchIntrinsic(I) && "Launch intrinsic within a dataflow graph!");
-    assert(!BuildDFG::isViscGraphIntrinsic(I) && "VISC graph intrinsic within a leaf dataflow node!");
+    assert(!BuildDFG::isViscLaunchIntrinsic(I) &&
+           "Launch intrinsic within a dataflow graph!");
+    assert(!BuildDFG::isViscGraphIntrinsic(I) &&
+           "VISC graph intrinsic within a leaf dataflow node!");
 
     if (BuildDFG::isViscIntrinsic(I)) {
-      IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-      IntrinsicInst* ArgII;
-      DFNode* ArgDFNode;
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+      IntrinsicInst *ArgII;
+      DFNode *ArgDFNode;
 
-      /************************ Handle VISC Query intrinsics ************************/
+      /************************ Handle VISC Query intrinsics
+       * ************************/
 
       switch (II->getIntrinsicID()) {
-      /**************************** llvm.visc.getNode() *****************************/
+      /**************************** llvm.visc.getNode()
+       * *****************************/
       case Intrinsic::visc_getNode: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNode\n");
         // add mapping <intrinsic, this node> to the node-specific map
         Leaf_HandleToDFNodeMap[II] = N;
         IItoRemove.push_back(II);
-      }
-      break;
-      /************************* llvm.visc.getParentNode() **************************/
+      } break;
+      /************************* llvm.visc.getParentNode()
+       * **************************/
       case Intrinsic::visc_getParentNode: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getParentNode\n");
         // get the parent node of the arg node
@@ -1199,9 +1204,9 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         Leaf_HandleToDFNodeMap[II] = ArgDFNode->getParent();
 
         IItoRemove.push_back(II);
-      }
-      break;
-      /*************************** llvm.visc.getNumDims() ***************************/
+      } break;
+      /*************************** llvm.visc.getNumDims()
+       * ***************************/
       case Intrinsic::visc_getNumDims: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumDims\n");
         // get node from map
@@ -1210,47 +1215,48 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         int numOfDim = ArgDFNode->getNumOfDim();
         DEBUG(errs() << "\t  Got node dimension : " << numOfDim << "\n");
-        IntegerType* IntTy = Type::getInt32Ty(KernelM->getContext());
-        ConstantInt* numOfDimConstant = ConstantInt::getSigned(IntTy, (int64_t) numOfDim);
+        IntegerType *IntTy = Type::getInt32Ty(KernelM->getContext());
+        ConstantInt *numOfDimConstant =
+            ConstantInt::getSigned(IntTy, (int64_t)numOfDim);
 
         // Replace the result of the intrinsic with the computed value
         II->replaceAllUsesWith(numOfDimConstant);
 
         IItoRemove.push_back(II);
-      }
-      break;
-      /*********************** llvm.visc.getNodeInstanceID() ************************/
+      } break;
+      /*********************** llvm.visc.getNodeInstanceID()
+       * ************************/
       case Intrinsic::visc_getNodeInstanceID_x:
       case Intrinsic::visc_getNodeInstanceID_y:
       case Intrinsic::visc_getNodeInstanceID_z: {
-        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n" << "\t: " << *II << "\n");
+        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNodeInstanceID\n"
+                     << "\t: " << *II << "\n");
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         assert(ArgDFNode && "Arg node is NULL");
         // A leaf node always has a parent
-        DFNode* ParentDFNode = ArgDFNode->getParent();
+        DFNode *ParentDFNode = ArgDFNode->getParent();
         assert(ParentDFNode && "Parent node of a leaf is NULL");
 
         // Get the number associated with the required dimension
         // FIXME: The order is important!
         // These three intrinsics need to be consecutive x,y,z
-        uint64_t dim = II->getIntrinsicID() -
-                       Intrinsic::visc_getNodeInstanceID_x;
+        uint64_t dim =
+            II->getIntrinsicID() - Intrinsic::visc_getNodeInstanceID_x;
         assert((dim < 3) && "Invalid dimension argument");
         DEBUG(errs() << "\t  dimension = " << dim << "\n");
 
         // Argument of the function to be called
-        ConstantInt * DimConstant =
-          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
-        //ArrayRef<Value *> Args(DimConstant);
+        ConstantInt *DimConstant =
+            ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        // ArrayRef<Value *> Args(DimConstant);
 
         // The following is to find which function to call
-        Function * OpenCLFunction;
+        Function *OpenCLFunction;
 
-        FunctionType* FT =
-          FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
-                            Type::getInt32Ty(KernelM->getContext()),
-                            false);
+        FunctionType *FT =
+            FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
+                              Type::getInt32Ty(KernelM->getContext()), false);
         if (SelectedHierarchy == ONE_LEVEL && ArgDFNode == N) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
@@ -1259,20 +1265,23 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
           // itself
           DEBUG(errs() << "Substitute with get_global_id()\n");
           DEBUG(errs() << *II << "\n");
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_global_id"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_global_id"), FT))
+                  .getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
-          //DEBUG(errs() << "Here inside cond 2\n");
+          // DEBUG(errs() << "Here inside cond 2\n");
           // We are asking for this node's id with respect to its parent
           // this is a local id call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_local_id"), FT)).getCallee());
-          //DEBUG(errs() << "exiting condition 2\n");
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_local_id"), FT))
+                  .getCallee());
+          // DEBUG(errs() << "exiting condition 2\n");
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
           // We are asking for this node's parent's id with respect to its
           // parent: this is a group id call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_group_id"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_group_id"), FT))
+                  .getCallee());
         } else {
           DEBUG(errs() << N->getFuncPointer()->getName() << "\n");
           DEBUG(errs() << N->getParent()->getFuncPointer()->getName() << "\n");
@@ -1281,21 +1290,21 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
           assert(false && "Unable to translate getNodeInstanceID intrinsic");
         }
 
-        //DEBUG(errs() << "Create call instruction, insert it before the instrinsic\n");
-        //DEBUG(errs() << "Function: " << *OpenCLFunction << "\n");
-        //DEBUG(errs() << "Arguments size: " << Args.size() << "\n");
-        //DEBUG(errs() << "Argument: " << Args[0] << "\n");
-        //DEBUG(errs() << "Arguments: " << *DimConstant << "\n");
+        // DEBUG(errs() << "Create call instruction, insert it before the
+        // instrinsic\n"); DEBUG(errs() << "Function: " << *OpenCLFunction <<
+        // "\n"); DEBUG(errs() << "Arguments size: " << Args.size() << "\n");
+        // DEBUG(errs() << "Argument: " << Args[0] << "\n");
+        // DEBUG(errs() << "Arguments: " << *DimConstant << "\n");
         // Create call instruction, insert it before the intrinsic and
         // replace the uses of the previous instruction with the new one
-        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
-        //DEBUG(errs() << "Replace uses\n");
+        CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        // DEBUG(errs() << "Replace uses\n");
         II->replaceAllUsesWith(CI);
 
         IItoRemove.push_back(II);
-      }
-      break;
-      /********************** llvm.visc.getNumNodeInstances() ***********************/
+      } break;
+      /********************** llvm.visc.getNumNodeInstances()
+       * ***********************/
       case Intrinsic::visc_getNumNodeInstances_x:
       case Intrinsic::visc_getNumNodeInstances_y:
       case Intrinsic::visc_getNumNodeInstances_z: {
@@ -1304,78 +1313,82 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
         // then, why do we need to keep that info in the graph?  (only for the
         // kernel configuration during the call)
 
-        DEBUG(errs() << F_nvptx->getName() << "\t: Handling getNumNodeInstances\n");
+        DEBUG(errs() << F_nvptx->getName()
+                     << "\t: Handling getNumNodeInstances\n");
         ArgII = cast<IntrinsicInst>((II->getOperand(0))->stripPointerCasts());
         ArgDFNode = Leaf_HandleToDFNodeMap[ArgII];
         // A leaf node always has a parent
-        DFNode* ParentDFNode = ArgDFNode->getParent();
+        DFNode *ParentDFNode = ArgDFNode->getParent();
         assert(ParentDFNode && "Parent node of a leaf is NULL");
 
         // Get the number associated with the required dimension
         // FIXME: The order is important!
         // These three intrinsics need to be consecutive x,y,z
-        uint64_t dim = II->getIntrinsicID() -
-                       Intrinsic::visc_getNumNodeInstances_x;
+        uint64_t dim =
+            II->getIntrinsicID() - Intrinsic::visc_getNumNodeInstances_x;
         assert((dim < 3) && "Invalid dimension argument");
         DEBUG(errs() << "\t  dimension = " << dim << "\n");
 
         // Argument of the function to be called
-        ConstantInt * DimConstant =
-          ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
-        //ArrayRef<Value *> Args(DimConstant);
+        ConstantInt *DimConstant =
+            ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), dim);
+        // ArrayRef<Value *> Args(DimConstant);
 
         // The following is to find which function to call
-        Function * OpenCLFunction;
-        FunctionType* FT =
+        Function *OpenCLFunction;
+        FunctionType *FT =
             FunctionType::get(Type::getInt64Ty(KernelM->getContext()),
-                              Type::getInt32Ty(KernelM->getContext()),
-                              false);
+                              Type::getInt32Ty(KernelM->getContext()), false);
 
         if (N == ArgDFNode && SelectedHierarchy == ONE_LEVEL) {
           // We only have one level in the hierarchy or the parent node is not
           // replicated. This indicates that the parent node is the kernel
           // launch, so the instances are global_size (gridDim x blockDim)
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_global_size"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_global_size"), FT))
+                  .getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N) {
           // We are asking for this node's instances
           // this is a local size (block dim) call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_local_size"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_local_size"), FT))
+                  .getCallee());
         } else if (Leaf_HandleToDFNodeMap[ArgII] == N->getParent()) {
           // We are asking for this node's parent's instances
           // this is a (global_size/local_size) (grid dim) call
-          OpenCLFunction = cast<Function>
-                           ((KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT)).getCallee());
+          OpenCLFunction = cast<Function>(
+              (KernelM->getOrInsertFunction(StringRef("get_num_groups"), FT))
+                  .getCallee());
         } else {
           assert(false && "Unable to translate getNumNodeInstances intrinsic");
         }
 
         // Create call instruction, insert it before the intrinsic and
         // replace the uses of the previous instruction with the new one
-        CallInst* CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
+        CallInst *CI = CallInst::Create(OpenCLFunction, DimConstant, "", II);
         II->replaceAllUsesWith(CI);
 
         IItoRemove.push_back(II);
-      }
-      break;
-      case Intrinsic::visc_barrier:
-      {
+      } break;
+      case Intrinsic::visc_barrier: {
         DEBUG(errs() << F_nvptx->getName() << "\t: Handling barrier\n");
         DEBUG(errs() << "Substitute with barrier()\n");
         DEBUG(errs() << *II << "\n");
-        FunctionType* FT = FunctionType::get(Type::getVoidTy(KernelM->getContext()),
-                                             std::vector<Type*>(1, Type::getInt32Ty(KernelM->getContext())),
-                                             false);
-        Function* OpenCLFunction = cast<Function>
-                                   ((KernelM->getOrInsertFunction(StringRef("barrier"), FT)).getCallee());
-        CallInst* CI = CallInst::Create(OpenCLFunction,
-                                        ArrayRef<Value*>(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)),
-                                        "", II);
+        FunctionType *FT = FunctionType::get(
+            Type::getVoidTy(KernelM->getContext()),
+            std::vector<Type *>(1, Type::getInt32Ty(KernelM->getContext())),
+            false);
+        Function *OpenCLFunction = cast<Function>(
+            (KernelM->getOrInsertFunction(StringRef("barrier"), FT))
+                .getCallee());
+        CallInst *CI =
+            CallInst::Create(OpenCLFunction,
+                             ArrayRef<Value *>(ConstantInt::get(
+                                 Type::getInt32Ty(KernelM->getContext()), 1)),
+                             "", II);
         II->replaceAllUsesWith(CI);
         IItoRemove.push_back(II);
-      }
-      break;
+      } break;
       case Intrinsic::visc_atomic_cmpxchg:
         break;
       case Intrinsic::visc_atomic_add:
@@ -1386,607 +1399,627 @@ void CGT_NVPTX::codeGen(DFLeafNode* N) {
       case Intrinsic::visc_atomic_and:
       case Intrinsic::visc_atomic_or:
       case Intrinsic::visc_atomic_xor:
-        //case Intrinsic::visc_atomic_inc:
-        //case Intrinsic::visc_atomic_dec:
-      {
-        DEBUG(errs() << *II << "\n");
-        // Only have support for i32 atomic intrinsics
-        assert(II->getType() == Type::getInt32Ty(II->getContext())
-               && "Only support i32 atomic intrinsics for now");
-        // Substitute with atomicrmw instruction
-        assert(II->getNumArgOperands() == 2 && "Expecting 2 operands for these atomics");
-        Value* Ptr = II->getArgOperand(0);
-        Value* Val = II->getArgOperand(1);
-        assert(Ptr->getType()->isPointerTy()
-               && "First argument of supported atomics is expected to be a pointer");
-        PointerType* PtrTy = cast<PointerType>(Ptr->getType());
-        PointerType* TargetTy = Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace());
-        if (PtrTy != TargetTy) {
-          Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II);
-          PtrTy = TargetTy;
+        // case Intrinsic::visc_atomic_inc:
+        // case Intrinsic::visc_atomic_dec:
+        {
+          DEBUG(errs() << *II << "\n");
+          // Only have support for i32 atomic intrinsics
+          assert(II->getType() == Type::getInt32Ty(II->getContext()) &&
+                 "Only support i32 atomic intrinsics for now");
+          // Substitute with atomicrmw instruction
+          assert(II->getNumArgOperands() == 2 &&
+                 "Expecting 2 operands for these atomics");
+          Value *Ptr = II->getArgOperand(0);
+          Value *Val = II->getArgOperand(1);
+          assert(Ptr->getType()->isPointerTy() &&
+                 "First argument of supported atomics is expected to be a "
+                 "pointer");
+          PointerType *PtrTy = cast<PointerType>(Ptr->getType());
+          PointerType *TargetTy =
+              Type::getInt32PtrTy(II->getContext(), PtrTy->getAddressSpace());
+          if (PtrTy != TargetTy) {
+            Ptr = CastInst::CreatePointerCast(Ptr, TargetTy, "", II);
+            PtrTy = TargetTy;
+          }
+
+          std::string name;
+          if (II->getIntrinsicID() == Intrinsic::visc_atomic_add)
+            name = "atomic_add";
+          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_sub)
+            name = "atomic_sub";
+          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_xchg)
+            name = "atomic_xchg";
+          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_min)
+            name = "atomic_min";
+          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_max)
+            name = "atomic_max";
+          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_and)
+            name = "atomic_and";
+          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_or)
+            name = "atomic_or";
+          else if (II->getIntrinsicID() == Intrinsic::visc_atomic_xor)
+            name = "atomic_xor";
+          Type *paramTypes[] = {PtrTy, Val->getType()};
+          FunctionType *AtomFuncT = FunctionType::get(
+              II->getType(), ArrayRef<Type *>(paramTypes, 2), false);
+          FunctionCallee AtomFunc =
+              KernelM->getOrInsertFunction(name, AtomFuncT);
+
+          Value *Params[] = {Ptr, Val};
+          CallInst *AtomCI = CallInst::Create(
+              AtomFunc, ArrayRef<Value *>(Params, 2), II->getName(), II);
+          DEBUG(errs() << "Substitute with: " << *AtomCI << "\n");
+          II->replaceAllUsesWith(AtomCI);
+          IItoRemove.push_back(II);
         }
+        break;
+      default:
+        llvm_unreachable("Unknown VISC Intrinsic!");
+        break;
+      }
 
-			 std::string name;
-			 if(II->getIntrinsicID() == Intrinsic::visc_atomic_add)
-				 name = "atomic_add";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_sub)
-				 name = "atomic_sub";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xchg)
-				 name = "atomic_xchg";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_min)
-				 name = "atomic_min";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_max)
-				 name = "atomic_max";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_and)
-				 name = "atomic_and";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_or)
-				 name = "atomic_or";
-			 else if(II->getIntrinsicID() == Intrinsic::visc_atomic_xor)
-				 name = "atomic_xor";
-			 Type* paramTypes[] = {PtrTy, Val->getType()};
-			 FunctionType * AtomFuncT = FunctionType::get(II->getType(), ArrayRef<Type*>(paramTypes,2), false);	
-			 FunctionCallee AtomFunc = KernelM->getOrInsertFunction(name, AtomFuncT);				
-
-			 Value* Params[] = {Ptr, Val};
-			 CallInst* AtomCI = CallInst::Create(AtomFunc, ArrayRef<Value*>(Params,2), II->getName(), II);
-			 DEBUG(errs() << "Substitute with: " << *AtomCI << "\n");
-			 II->replaceAllUsesWith(AtomCI);
-			 IItoRemove.push_back(II);
-			}
-			break;
-			default:
-			llvm_unreachable("Unknown VISC Intrinsic!");
-			break;
-			}
-
-		}
-		else if(MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
-			IRBuilder<> Builder(I);
-			Value *Source = MemCpyI->getSource();
-			Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
-			Value *Length = MemCpyI->getOperand(2);
-			DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
-			DEBUG(errs() << "Source: " << *Source << "\n"); 
-			DEBUG(errs() << "Destination: " << *Destination << "\n"); 
-			DEBUG(errs() << "Length: " << *Length << "\n");
-
-			size_t memcpy_length;
-			unsigned int memcpy_count;
-			if (ConstantInt* CI = dyn_cast<ConstantInt>(Length)) {
-				if (CI->getBitWidth() <= 64) {
-					memcpy_length = CI->getSExtValue();
-					DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n");
-					Type *Source_Type = Source->getType()->getPointerElementType();
-					DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
-					memcpy_count = memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
-					DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
-					if (GetElementPtrInst *sourceGEPI = dyn_cast<GetElementPtrInst>(Source)) {
-						if (GetElementPtrInst *destGEPI = dyn_cast<GetElementPtrInst>(Destination)) {
-							Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
-							Value *DestPtrOperand = destGEPI->getPointerOperand();
-							for(int i = 0; i < memcpy_count; ++i) {
-								Constant *increment;
-								LoadInst *newLoadI;
-								StoreInst *newStoreI;
-								// First, need to increment the correct index for both source and dest 
-								// This invluves checking to see how many indeces the GEP has
-								// Assume for now only 1 or 2 are the viable options.
-
-								std::vector<Value*> GEPlIndex;
-								if (sourceGEPI->getNumIndices() == 1) {
-									Value *Index = sourceGEPI->getOperand(1);      
-									increment = ConstantInt::get(Index->getType(), i, false);
-									Value *incAdd = Builder.CreateAdd(Index, increment);
-									DEBUG(errs() << "Add: " << *incAdd << "\n");
-									GEPlIndex.push_back(incAdd);
-									Value *newGEPIl = Builder.CreateGEP(SourcePtrOperand, ArrayRef<Value*>(GEPlIndex));
-									DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
-									newLoadI = Builder.CreateLoad(newGEPIl);
-									DEBUG(errs() << "Load: " << *newLoadI << "\n");
-								} else { 
-									llvm_unreachable("Unhandled case where source GEPI has more than 1 indices!\n");
-								}
-
-
-								std::vector<Value*> GEPsIndex;
-								if (destGEPI->getNumIndices() == 1) {
-
-								} else if (destGEPI->getNumIndices() == 2) {
-									Value *Index0 = destGEPI->getOperand(1);      
-									GEPsIndex.push_back(Index0);
-									Value *Index1 = destGEPI->getOperand(2);      
-									increment = ConstantInt::get(Index1->getType(), i, false);
-									Value *incAdd = Builder.CreateAdd(Index1, increment);
-									DEBUG(errs() << "Add: " << *incAdd << "\n");
-									GEPsIndex.push_back(incAdd);
-									Value *newGEPIs = Builder.CreateGEP(DestPtrOperand, ArrayRef<Value*>(GEPsIndex));
-									DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
-									newStoreI = Builder.CreateStore(newLoadI, newGEPIs, MemCpyI->isVolatile());
-									DEBUG(errs() << "Store: " << *newStoreI << "\n");
-								} else {
-									llvm_unreachable("Unhandled case where dest GEPI has more than 2 indices!\n");
-								}
-							}
-							IItoRemove.push_back(sourceGEPI);
-							IItoRemove.push_back(destGEPI);
-							Instruction *destBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
-							Instruction *sourceBitcastI = dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
-							IItoRemove.push_back(destBitcastI);
-							IItoRemove.push_back(sourceBitcastI);
-							IItoRemove.push_back(MemCpyI);
-						}
-					}
-
-				}
-			} else {
-				llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
-			}
-			//      llvm_unreachable("HERE!");
-		}
-
-		else if(CallInst* CI = dyn_cast<CallInst>(I)) {
-			DEBUG(errs() << "Found a call: " << *CI << "\n");
-			Function* calleeF = cast<Function>(CI->getCalledValue()->stripPointerCasts());
-			if(calleeF->isDeclaration()) {
-				// Add the declaration to kernel module
-				if (calleeF->getName() == "sqrtf") {
-					calleeF->setName(Twine("sqrt"));
-					DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
-					DEBUG(errs() << "CI: " << *CI << "\n");
-				} else if (calleeF->getName() == "rsqrtf") {
-					calleeF->setName(Twine("rsqrt"));
-					DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
-					DEBUG(errs() << "CI: " << *CI << "\n");
-				}  
-				DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF << "\n");
-				KernelM->getOrInsertFunction(calleeF->getName(), calleeF->getFunctionType());
-			}
-			else {
-				// Check if the called function has already been cloned before.
-				Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
-				// Iterate over the new function to see if it calls any other functions
-				// in the module.
-				for(inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc); i != e; ++i) {
-					if(auto *Call = dyn_cast<CallInst>(&*i)) {
-						Function *CalledFunc = cast<Function>(Call->getCalledValue()->stripPointerCasts());
-						CloneAndReplaceCall(Call, CalledFunc);
-					}
-				}
-			}
-			//TODO: how to handle address space qualifiers in load/store
-		}
-
-	}
-	// search for pattern where float is being casted to int and loaded/stored and change it.	
-	DEBUG(errs() << "finding pattern for replacement!\n");
-	for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e; ++i) {
-		bool cont = false;
-		bool keepGEPI = false;
-		bool keepGEPI2= false;
-		Instruction *I = &(*i);
-		GetElementPtrInst* GEPI = dyn_cast<GetElementPtrInst>(I);
-
-		if (!GEPI) {
-			// did nod find pattern start, continue
-			continue;
-		}
-		// may have found pattern, check
-		DEBUG(errs() << "GEPI " << *GEPI << "\n");
-		// print whatever we want for debug
-		Value* PtrOp = GEPI->getPointerOperand();
-		Type *SrcTy = GEPI->getSourceElementType();
-		unsigned GEPIaddrspace = GEPI->getAddressSpace();
-
-		if (SrcTy->isArrayTy()) 
-			DEBUG(errs() << *SrcTy << " is an array type! " << *(SrcTy->getArrayElementType()) << "\n");
-		else
-			DEBUG(errs() << *SrcTy << " is not an array type!\n");
-		// check that source element type is float
-		if (SrcTy->isArrayTy()) {
-			if (!(SrcTy->getArrayElementType()->isFloatTy())) {
-				DEBUG(errs() << "GEPI type is array but not float!\n");
-				continue;
-			}
-		}
-		else if (!(SrcTy->isFPOrFPVectorTy()/*isFloatTy()*/)) {
-			DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
-			// does not fit this pattern - no float GEP instruction
-			continue;
-		}
-		// check that addressspace is 1
-		//	  if (GEPIaddrspace != 1) {
-		//			// does not fit this pattern - addrspace of pointer argument is not global
-		//			continue;
-		//		}
-		if (!(GEPI->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			//continue;
-			// Keep GEPI around if it has other uses
-			keepGEPI = true;
-		}
-		DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
-
-		// 1st GEPI it has one use
-		//		assert(GEPI->hasOneUse() && "GEPI has a single use");
-
-		// See if it is a bitcast
-		BitCastInst *BitCastI;
-		for (User * U : GEPI->users()) {
-			if(Instruction *ui = dyn_cast<Instruction> (U)) { 
-				DEBUG(errs() << "--" << *ui << "\n");
-				if (isa<BitCastInst>(ui)) {
-					BitCastI = dyn_cast<BitCastInst>(ui);
-					DEBUG(errs() << "---Found bitcast as only use of GEP\n");
-					break;
-				}
-			}
-			DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
-			cont = true;
-		}
-		//		for (Value::user_iterator ui = GEPI->user_begin(),
-		//				ue = GEPI->user_end(); ui!=ue; ++ui) {
-		//        DEBUG(errs() << "--" << *ui << "\n");
-		//			if (isa<BitCastInst>(*ui)) {
-		//				BitCastI = dyn_cast<BitCastInst>(*ui);
-		//        DEBUG(errs() << "Found bitcast as only use of GEP\n");
-		//			}
-		//		}
-
-		if (cont/*!BitCastI*/) {
-			continue; // not in pattern
-		}
-
-		//    DEBUG(errs() << *BitCastI << "\n");
-		// Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand has to be the GEP, since this is a use of the GEP.
-		Value *Op2 = BitCastI->getOperand(0);
-		DEBUG(errs() << "----" << *Op2 << "\n");
-		//		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
-		//		Type *OpTy = cast<Type>(Op2);
-		Type *OpTy = BitCastI->getDestTy();
-		DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
-		//    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) << "\n");
-		if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
-			// maybe right syntax is (Type::getInt32Ty)->getPointerTo()
-			continue; // not in pattern
-		}
-
-		DEBUG(errs() << "----Here!\n");
-		// We are in GEP, bitcast.
-
-		// user_iterator, to find the load.
-
-		if (!(BitCastI->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			continue;
-		}
-		DEBUG(errs() << "----Bitcast has one use!\n");
-		// it has one use
-		assert(BitCastI->hasOneUse() && "BitCastI has a single use");
-		LoadInst *LoadI;
-		for (User * U : BitCastI->users()) { 
-			if (Instruction *ui = dyn_cast<Instruction> (U)) {
-				DEBUG(errs() << "-----" << *ui << "\n");
-				if (isa<LoadInst>(ui)) {
-					LoadI = dyn_cast<LoadInst>(ui);
-					DEBUG(errs() << "-----Found load as only use of bitcast\n");
-					break;
-				}
-			}
-			DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
-			cont = true;
-		}
-		//		for (Value::user_iterator ui = BitCastI->user_begin(),
-		//				ue = BitCastI->user_end(); ui!=ue; ++ui) {
-		//			if (isa<LoadInst>(*ui)) {
-		//				LoadI = dyn_cast<LoadInst>(*ui);
-		//        errs() << "Found load as only use of bitcast\n";
-		//			}
-		//		}
-
-		if (cont) {
-			continue; // not in pattern
-		}
-
-		DEBUG("HERE!\n");
-		// check that we load from pointer we got from bitcast - assert - the unique argument must be the use we found it from
-		assert(LoadI->getPointerOperand() == BitCastI && "Unexpected Load Instruction Operand\n");
-
-		// Copy user_iterator, to find the store.
-
-		if (!(LoadI->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			continue;
-			// TODO: generalize: one load can have more than one store users
-		}
-
-		// it has one use
-		assert(LoadI->hasOneUse() && "LoadI has a single use");
-		Value::user_iterator ui = LoadI->user_begin();
-		// skipped loop, because is has a single use
-		StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
-		if (!StoreI) {
-			continue; // not in pattern
-		}
-
-		// Also check that the store uses the loaded value as the value operand
-		if (StoreI->getValueOperand() != LoadI) {
-			continue;
-		}
-
-		DEBUG(errs() << "-------Found store instruction\n");
-
-		// Look for its bitcast, which is its pointer operand
-		Value *StPtrOp = StoreI->getPointerOperand();
-		DEBUG(errs() << "-------" << *StPtrOp << "\n");
-		BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
-		DEBUG(errs() << "-------" << *BitCastI2 << "\n");
-		if (!BitCastI2) {
-			continue; //not in pattern
-		}
-
-		DEBUG(errs() << "-------- Found Bit Cast of store!\n" );
-		// found bitcast. Look for the second GEP, its from operand.
-		Value *BCFromOp = BitCastI2->getOperand(0);
-		GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
-		DEBUG(errs() << "---------- " << *GEPI2 << "\n");
-		if (!GEPI2) {
-			continue; //not in pattern
-		}
-
-		if (!(GEPI2->hasOneUse())) {
-			// does not fit this pattern - more than one uses
-			//continue;
-			// Keep GEPI around if it has other uses
-			keepGEPI2 = true;
-		}
-		DEBUG(errs() << "---------- Found GEPI of Bitcast!\n"); 
-
-		Value *PtrOp2 = GEPI2->getPointerOperand();
-
-		// Found GEPI2. TODO: kind of confused as o what checks I need to add here, let's add them together- all the code for int-float type checks is already above.
-
-		// Assume we found pattern
-		if (!keepGEPI) {  
-			IItoRemove.push_back(GEPI);
-			DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
-		} else {
-			DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
-		}
-		IItoRemove.push_back(BitCastI);
-		DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
-		IItoRemove.push_back(LoadI);
-		DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
-		IItoRemove.push_back(GEPI2);
-		DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
-		IItoRemove.push_back(BitCastI2);
-		DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
-		if (!keepGEPI2) {
-			IItoRemove.push_back(StoreI);
-			DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
-		} else {
-
-			DEBUG(errs() << "Keeping " << *StoreI << " since it has multiple uses!\n");
-		}
-
-		std::vector<Value*> GEPlIndex;
-		if (GEPI->hasIndices()) {
-			for(auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
-				Value *Index = dyn_cast<Value>(&*ii);
-				DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
-				GEPlIndex.push_back(Index);
-			}
-		}
-		//    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
-
-		std::vector<Value*> GEPsIndex;
-		if (GEPI2->hasIndices()) {
-			for(auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
-				Value *Index = dyn_cast<Value>(&*ii);
-				DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
-				GEPsIndex.push_back(Index);
-			}
-		}
-		//    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
-
-
-
-		//    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
-		GetElementPtrInst* newlGEP =
-			GetElementPtrInst::Create(GEPI->getSourceElementType(), //Type::getFloatTy(M.getContext()),
-					PtrOp, // operand from 1st GEP
-					ArrayRef<Value*>(GEPlIndex),
-					Twine(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newlGEP << "\n");
-		// insert load before GEPI
-		LoadInst *newLoadI =
-			new LoadInst(Type::getFloatTy(M.getContext()),
-					newlGEP, // new GEP
-					Twine(),
-					LoadI->isVolatile(),
-					LoadI->getAlignment(),
-					LoadI->getOrdering(),
-					LoadI->getSyncScopeID(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newLoadI << "\n");
-		// same for GEP for store, for store operand
-		GetElementPtrInst* newsGEP =
-			GetElementPtrInst::Create(GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
-					PtrOp2, // operand from 2nd GEP
-					ArrayRef<Value*>(GEPsIndex),
-					Twine(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newsGEP << "\n");
-		// insert store before GEPI
-		StoreInst *newStoreI =
-			new StoreInst(newLoadI,
-					newsGEP, // new GEP
-					StoreI->isVolatile(),
-					StoreI->getAlignment(),
-					StoreI->getOrdering(),
-					StoreI->getSyncScopeID(),
-					StoreI);
-		DEBUG(errs() << "Adding: " << *newStoreI << "\n");
-
-	}
-
-	// We need to do this explicitly: DCE pass will not remove them because we
-	// have assumed theworst memory behaviour for these function calls
-	// Traverse the vector backwards, otherwise definitions are deleted while
-	// their subsequent uses are still around
-	for (auto *I : reverse(IItoRemove)) {
-		DEBUG(errs() << "Erasing: " << *I << "\n");
-		I->eraseFromParent();
-	}
-
-	// Removed the cloned functions from the parent module into the new module 
-	for(auto *F : FuncToBeRemoved) {
-		F->removeFromParent(); //TODO: MARIA check
-		KernelM->getFunctionList().push_back(F);
-	}
-
-	addCLMetadata(F_nvptx);
-	kernel->KernelFunction = F_nvptx;
-	DEBUG(errs() << "Identified kernel - " << kernel->KernelFunction->getName() << "\n");
-	DEBUG(errs() << *KernelM);
-
-	return;
-}
+    } else if (MemCpyInst *MemCpyI = dyn_cast<MemCpyInst>(I)) {
+      IRBuilder<> Builder(I);
+      Value *Source = MemCpyI->getSource();
+      Value *Destination = MemCpyI->getArgOperand(0)->stripPointerCasts();
+      Value *Length = MemCpyI->getOperand(2);
+      DEBUG(errs() << "Found memcpy instruction: " << *I << "\n");
+      DEBUG(errs() << "Source: " << *Source << "\n");
+      DEBUG(errs() << "Destination: " << *Destination << "\n");
+      DEBUG(errs() << "Length: " << *Length << "\n");
+
+      size_t memcpy_length;
+      unsigned int memcpy_count;
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(Length)) {
+        if (CI->getBitWidth() <= 64) {
+          memcpy_length = CI->getSExtValue();
+          DEBUG(errs() << "Memcpy lenght = " << memcpy_length << "\n");
+          Type *Source_Type = Source->getType()->getPointerElementType();
+          DEBUG(errs() << "Source Type : " << *Source_Type << "\n");
+          memcpy_count =
+              memcpy_length / (Source_Type->getPrimitiveSizeInBits() / 8);
+          DEBUG(errs() << "Memcpy count = " << memcpy_count << "\n");
+          if (GetElementPtrInst *sourceGEPI =
+                  dyn_cast<GetElementPtrInst>(Source)) {
+            if (GetElementPtrInst *destGEPI =
+                    dyn_cast<GetElementPtrInst>(Destination)) {
+              Value *SourcePtrOperand = sourceGEPI->getPointerOperand();
+              Value *DestPtrOperand = destGEPI->getPointerOperand();
+              for (int i = 0; i < memcpy_count; ++i) {
+                Constant *increment;
+                LoadInst *newLoadI;
+                StoreInst *newStoreI;
+                // First, need to increment the correct index for both source
+                // and dest. This involves checking to see how many indices the
+                // GEP has. Assume for now only 1 or 2 are the viable options.
+
+                std::vector<Value *> GEPlIndex;
+                if (sourceGEPI->getNumIndices() == 1) {
+                  Value *Index = sourceGEPI->getOperand(1);
+                  increment = ConstantInt::get(Index->getType(), i, false);
+                  Value *incAdd = Builder.CreateAdd(Index, increment);
+                  DEBUG(errs() << "Add: " << *incAdd << "\n");
+                  GEPlIndex.push_back(incAdd);
+                  Value *newGEPIl = Builder.CreateGEP(
+                      SourcePtrOperand, ArrayRef<Value *>(GEPlIndex));
+                  DEBUG(errs() << "Load GEP: " << *newGEPIl << "\n");
+                  newLoadI = Builder.CreateLoad(newGEPIl);
+                  DEBUG(errs() << "Load: " << *newLoadI << "\n");
+                } else {
+                  llvm_unreachable("Unhandled case where source GEPI has more "
+                                   "than 1 indices!\n");
+                }
+
+                std::vector<Value *> GEPsIndex;
+                if (destGEPI->getNumIndices() == 1) {
+
+                } else if (destGEPI->getNumIndices() == 2) {
+                  Value *Index0 = destGEPI->getOperand(1);
+                  GEPsIndex.push_back(Index0);
+                  Value *Index1 = destGEPI->getOperand(2);
+                  increment = ConstantInt::get(Index1->getType(), i, false);
+                  Value *incAdd = Builder.CreateAdd(Index1, increment);
+                  DEBUG(errs() << "Add: " << *incAdd << "\n");
+                  GEPsIndex.push_back(incAdd);
+                  Value *newGEPIs = Builder.CreateGEP(
+                      DestPtrOperand, ArrayRef<Value *>(GEPsIndex));
+                  DEBUG(errs() << "Store GEP: " << *newGEPIs << "\n");
+                  newStoreI = Builder.CreateStore(newLoadI, newGEPIs,
+                                                  MemCpyI->isVolatile());
+                  DEBUG(errs() << "Store: " << *newStoreI << "\n");
+                } else {
+                  llvm_unreachable("Unhandled case where dest GEPI has more "
+                                   "than 2 indices!\n");
+                }
+              }
+              IItoRemove.push_back(sourceGEPI);
+              IItoRemove.push_back(destGEPI);
+              Instruction *destBitcastI =
+                  dyn_cast<Instruction>(MemCpyI->getArgOperand(0));
+              Instruction *sourceBitcastI =
+                  dyn_cast<Instruction>(MemCpyI->getArgOperand(1));
+              IItoRemove.push_back(destBitcastI);
+              IItoRemove.push_back(sourceBitcastI);
+              IItoRemove.push_back(MemCpyI);
+            }
+          }
+        }
+      } else {
+        llvm_unreachable("MEMCPY length is not a constant, not handled!\n");
+      }
+      //      llvm_unreachable("HERE!");
+    }
 
-bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
-	DEBUG(errs() << "\nDFG2LLVM_NVPTX PASS\n");
+    else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      DEBUG(errs() << "Found a call: " << *CI << "\n");
+      Function *calleeF =
+          cast<Function>(CI->getCalledValue()->stripPointerCasts());
+      if (calleeF->isDeclaration()) {
+        // Add the declaration to kernel module
+        if (calleeF->getName() == "sqrtf") {
+          calleeF->setName(Twine("sqrt"));
+          DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
+          DEBUG(errs() << "CI: " << *CI << "\n");
+        } else if (calleeF->getName() == "rsqrtf") {
+          calleeF->setName(Twine("rsqrt"));
+          DEBUG(errs() << "CaleeF: " << *calleeF << "\n");
+          DEBUG(errs() << "CI: " << *CI << "\n");
+        }
+        DEBUG(errs() << "Adding declaration to Kernel module: " << *calleeF
+                     << "\n");
+        KernelM->getOrInsertFunction(calleeF->getName(),
+                                     calleeF->getFunctionType());
+      } else {
+        // Check if the called function has already been cloned before.
+        Function *NewFunc = CloneAndReplaceCall(CI, calleeF);
+        // Iterate over the new function to see if it calls any other functions
+        // in the module.
+        for (inst_iterator i = inst_begin(NewFunc), e = inst_end(NewFunc);
+             i != e; ++i) {
+          if (auto *Call = dyn_cast<CallInst>(&*i)) {
+            Function *CalledFunc =
+                cast<Function>(Call->getCalledValue()->stripPointerCasts());
+            CloneAndReplaceCall(Call, CalledFunc);
+          }
+        }
+      }
+      // TODO: how to handle address space qualifiers in load/store
+    }
+  }
+  // search for pattern where float is being casted to int and loaded/stored and
+  // change it.
+  DEBUG(errs() << "finding pattern for replacement!\n");
+  for (inst_iterator i = inst_begin(F_nvptx), e = inst_end(F_nvptx); i != e;
+       ++i) {
+    bool cont = false;
+    bool keepGEPI = false;
+    bool keepGEPI2 = false;
+    Instruction *I = &(*i);
+    GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I);
 
-	// Get the BuildDFG Analysis Results:
-	// - Dataflow graph
-	// - Maps from i8* hansles to DFNode and DFEdge
-	BuildDFG &DFG = getAnalysis<BuildDFG>();
+    if (!GEPI) {
+      // did not find pattern start, continue
+      continue;
+    }
+    // may have found pattern, check
+    DEBUG(errs() << "GEPI " << *GEPI << "\n");
+    // print whatever we want for debug
+    Value *PtrOp = GEPI->getPointerOperand();
+    Type *SrcTy = GEPI->getSourceElementType();
+    unsigned GEPIaddrspace = GEPI->getAddressSpace();
+
+    if (SrcTy->isArrayTy())
+      DEBUG(errs() << *SrcTy << " is an array type! "
+                   << *(SrcTy->getArrayElementType()) << "\n");
+    else
+      DEBUG(errs() << *SrcTy << " is not an array type!\n");
+    // check that source element type is float
+    if (SrcTy->isArrayTy()) {
+      if (!(SrcTy->getArrayElementType()->isFloatTy())) {
+        DEBUG(errs() << "GEPI type is array but not float!\n");
+        continue;
+      }
+    } else if (!(SrcTy->isFPOrFPVectorTy() /*isFloatTy()*/)) {
+      DEBUG(errs() << "GEPI type is " << *SrcTy << "\n");
+      // does not fit this pattern - no float GEP instruction
+      continue;
+    }
+    // check that addressspace is 1
+    //	  if (GEPIaddrspace != 1) {
+    //			// does not fit this pattern - addrspace of pointer argument is
+    //not global 			continue;
+    //		}
+    if (!(GEPI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      // continue;
+      // Keep GEPI around if it has other uses
+      keepGEPI = true;
+    }
+    DEBUG(errs() << "Found GEPI " << *GEPI << "\n");
+
+    // 1st GEPI it has one use
+    //		assert(GEPI->hasOneUse() && "GEPI has a single use");
+
+    // See if it is a bitcast
+    BitCastInst *BitCastI;
+    for (User *U : GEPI->users()) {
+      if (Instruction *ui = dyn_cast<Instruction>(U)) {
+        DEBUG(errs() << "--" << *ui << "\n");
+        if (isa<BitCastInst>(ui)) {
+          BitCastI = dyn_cast<BitCastInst>(ui);
+          DEBUG(errs() << "---Found bitcast as only use of GEP\n");
+          break;
+        }
+      }
+      DEBUG(errs() << "GEPI does not have a bitcast user, continue\n");
+      cont = true;
+    }
+    //		for (Value::user_iterator ui = GEPI->user_begin(),
+    //				ue = GEPI->user_end(); ui!=ue; ++ui) {
+    //        DEBUG(errs() << "--" << *ui << "\n");
+    //			if (isa<BitCastInst>(*ui)) {
+    //				BitCastI = dyn_cast<BitCastInst>(*ui);
+    //        DEBUG(errs() << "Found bitcast as only use of GEP\n");
+    //			}
+    //		}
+
+    if (cont /*!BitCastI*/) {
+      continue; // not in pattern
+    }
 
-	// DFInternalNode *Root = DFG.getRoot();
-	std::vector<DFInternalNode*> Roots = DFG.getRoots();
-	//    BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
-	//    BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+    //    DEBUG(errs() << *BitCastI << "\n");
+    // Otherwise, check that first operand is GEP and 2nd is i32*. 1st Operand
+    // has to be the GEP, since this is a use of the GEP.
+    Value *Op2 = BitCastI->getOperand(0);
+    DEBUG(errs() << "----" << *Op2 << "\n");
+    //		assert(cast<Type>(Op2) && "Invalid Operand for Bitcast\n");
+    //		Type *OpTy = cast<Type>(Op2);
+    Type *OpTy = BitCastI->getDestTy();
+    DEBUG(errs() << "---- Bitcast destination type: " << *OpTy << "\n");
+    //    DEBUG(errs() << "---- " << *(Type::getInt32PtrTy(M.getContext(),1)) <<
+    //    "\n");
+    if (!(OpTy == Type::getInt32PtrTy(M.getContext(), GEPIaddrspace))) {
+      // maybe right syntax is (Type::getInt32Ty)->getPointerTo()
+      continue; // not in pattern
+    }
 
-	// Visitor for Code Generation Graph Traversal
-	CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
+    DEBUG(errs() << "----Here!\n");
+    // We are in GEP, bitcast.
 
-	// Iterate over all the DFGs and produce code for each one of them
-	for (auto rootNode: Roots) {
-		// Initiate code generation for root DFNode
-		CGTVisitor->visit(rootNode);
-	}
+    // user_iterator, to find the load.
 
-	CGTVisitor->writeKernelsModule();
+    if (!(BitCastI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      continue;
+    }
+    DEBUG(errs() << "----Bitcast has one use!\n");
+    // it has one use
+    assert(BitCastI->hasOneUse() && "BitCastI has a single use");
+    LoadInst *LoadI;
+    for (User *U : BitCastI->users()) {
+      if (Instruction *ui = dyn_cast<Instruction>(U)) {
+        DEBUG(errs() << "-----" << *ui << "\n");
+        if (isa<LoadInst>(ui)) {
+          LoadI = dyn_cast<LoadInst>(ui);
+          DEBUG(errs() << "-----Found load as only use of bitcast\n");
+          break;
+        }
+      }
+      DEBUG(errs() << "Bitcast does not have a load user, continue!\n");
+      cont = true;
+    }
+    //		for (Value::user_iterator ui = BitCastI->user_begin(),
+    //				ue = BitCastI->user_end(); ui!=ue; ++ui) {
+    //			if (isa<LoadInst>(*ui)) {
+    //				LoadI = dyn_cast<LoadInst>(*ui);
+    //        errs() << "Found load as only use of bitcast\n";
+    //			}
+    //		}
+
+    if (cont) {
+      continue; // not in pattern
+    }
 
-	//TODO: Edit module epilogue to remove the VISC intrinsic declarations
-	delete CGTVisitor;
+    DEBUG("HERE!\n");
+    // check that we load from pointer we got from bitcast - assert - the unique
+    // argument must be the use we found it from
+    assert(LoadI->getPointerOperand() == BitCastI &&
+           "Unexpected Load Instruction Operand\n");
 
-	return true;
-}
+    // Copy user_iterator, to find the store.
 
-std::string CGT_NVPTX::getKernelsModuleName(Module &M) {
-	/*SmallString<128> currentDir;
-		llvm::sys::fs::current_path(currentDir);
-		std::string fileName = getFilenameFromModule(M);
-		Twine output = Twine(currentDir) + "/Output/" + fileName + "";
-		return output.str().append(".kernels.ll");*/
-	std::string mid = M.getModuleIdentifier();
-	return mid.append(".kernels.ll");
-}
+    if (!(LoadI->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      continue;
+      // TODO: generalize: one load can have more than one store users
+    }
 
-void CGT_NVPTX::fixValueAddrspace(Value* V, unsigned addrspace) {
-	assert(isa<PointerType>(V->getType())
-			&& "Value should be of Pointer Type!");
-	PointerType* OldTy = cast<PointerType>(V->getType());
-	PointerType* NewTy = PointerType::get(OldTy->getElementType(), addrspace);
-	V->mutateType(NewTy);
-	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue; ui++) {
-		// Change all uses producing pointer type in same address space to new
-		// addressspace.
-		if(PointerType* PTy = dyn_cast<PointerType>((*ui)->getType())) {
-			if(PTy->getAddressSpace() == OldTy->getAddressSpace()) {
-				fixValueAddrspace(*ui, addrspace);
-			}
-		}
-	}
+    // it has one use
+    assert(LoadI->hasOneUse() && "LoadI has a single use");
+    Value::user_iterator ui = LoadI->user_begin();
+    // skipped loop, because it has a single use
+    StoreInst *StoreI = dyn_cast<StoreInst>(*ui);
+    if (!StoreI) {
+      continue; // not in pattern
+    }
+
+    // Also check that the store uses the loaded value as the value operand
+    if (StoreI->getValueOperand() != LoadI) {
+      continue;
+    }
+
+    DEBUG(errs() << "-------Found store instruction\n");
+
+    // Look for its bitcast, which is its pointer operand
+    Value *StPtrOp = StoreI->getPointerOperand();
+    DEBUG(errs() << "-------" << *StPtrOp << "\n");
+    BitCastInst *BitCastI2 = dyn_cast<BitCastInst>(StPtrOp);
+    DEBUG(errs() << "-------" << *BitCastI2 << "\n");
+    if (!BitCastI2) {
+      continue; // not in pattern
+    }
+
+    DEBUG(errs() << "-------- Found Bit Cast of store!\n");
+    // found bitcast. Look for the second GEP, its from operand.
+    Value *BCFromOp = BitCastI2->getOperand(0);
+    GetElementPtrInst *GEPI2 = dyn_cast<GetElementPtrInst>(BCFromOp);
+    DEBUG(errs() << "---------- " << *GEPI2 << "\n");
+    if (!GEPI2) {
+      continue; // not in pattern
+    }
+
+    if (!(GEPI2->hasOneUse())) {
+      // does not fit this pattern - more than one uses
+      // continue;
+      // Keep GEPI around if it has other uses
+      keepGEPI2 = true;
+    }
+    DEBUG(errs() << "---------- Found GEPI of Bitcast!\n");
+
+    Value *PtrOp2 = GEPI2->getPointerOperand();
+
+    // Found GEPI2. TODO: kind of confused as to what checks I need to add here,
+    // let's add them together- all the code for int-float type checks is
+    // already above.
+
+    // Assume we found pattern
+    if (!keepGEPI) {
+      IItoRemove.push_back(GEPI);
+      DEBUG(errs() << "Pushing " << *GEPI << " for removal\n");
+    } else {
+      DEBUG(errs() << "Keeping " << *GEPI << " since it has multiple uses!\n");
+    }
+    IItoRemove.push_back(BitCastI);
+    DEBUG(errs() << "Pushing " << *BitCastI << " for removal\n");
+    IItoRemove.push_back(LoadI);
+    DEBUG(errs() << "Pushing " << *LoadI << " for removal\n");
+    IItoRemove.push_back(GEPI2);
+    DEBUG(errs() << "Pushing " << *GEPI2 << " for removal\n");
+    IItoRemove.push_back(BitCastI2);
+    DEBUG(errs() << "Pushing " << *BitCastI2 << " for removal\n");
+    if (!keepGEPI2) {
+      IItoRemove.push_back(StoreI);
+      DEBUG(errs() << "Pushing " << *StoreI << " for removal\n");
+    } else {
+
+      DEBUG(errs() << "Keeping " << *StoreI
+                   << " since it has multiple uses!\n");
+    }
+
+    std::vector<Value *> GEPlIndex;
+    if (GEPI->hasIndices()) {
+      for (auto ii = GEPI->idx_begin(); ii != GEPI->idx_end(); ++ii) {
+        Value *Index = dyn_cast<Value>(&*ii);
+        DEBUG(errs() << "GEP-1 Index: " << *Index << "\n");
+        GEPlIndex.push_back(Index);
+      }
+    }
+    //    ArrayRef<Value*> GEPlArrayRef(GEPlIndex);
+
+    std::vector<Value *> GEPsIndex;
+    if (GEPI2->hasIndices()) {
+      for (auto ii = GEPI2->idx_begin(); ii != GEPI2->idx_end(); ++ii) {
+        Value *Index = dyn_cast<Value>(&*ii);
+        DEBUG(errs() << "GEP-2 Index: " << *Index << "\n");
+        GEPsIndex.push_back(Index);
+      }
+    }
+    //    ArrayRef<Value*> GEPsArrayRef(GEPlIndex);
+
+    //    ArrayRef<Value*>(GEPI->idx_begin(), GEPI->idx_end());
+    GetElementPtrInst *newlGEP = GetElementPtrInst::Create(
+        GEPI->getSourceElementType(), // Type::getFloatTy(M.getContext()),
+        PtrOp,                        // operand from 1st GEP
+        ArrayRef<Value *>(GEPlIndex), Twine(), StoreI);
+    DEBUG(errs() << "Adding: " << *newlGEP << "\n");
+    // insert load before the original store
+    LoadInst *newLoadI =
+        new LoadInst(Type::getFloatTy(M.getContext()),
+                     newlGEP, // new GEP
+                     Twine(), LoadI->isVolatile(), LoadI->getAlignment(),
+                     LoadI->getOrdering(), LoadI->getSyncScopeID(), StoreI);
+    DEBUG(errs() << "Adding: " << *newLoadI << "\n");
+    // same for GEP for store, for store operand
+    GetElementPtrInst *newsGEP = GetElementPtrInst::Create(
+        GEPI2->getSourceElementType(), // Type::getFloatTy(M.getContext()),
+        PtrOp2,                        // operand from 2nd GEP
+        ArrayRef<Value *>(GEPsIndex), Twine(), StoreI);
+    DEBUG(errs() << "Adding: " << *newsGEP << "\n");
+    // insert new store before the original store
+    StoreInst *newStoreI =
+        new StoreInst(newLoadI,
+                      newsGEP, // new GEP
+                      StoreI->isVolatile(), StoreI->getAlignment(),
+                      StoreI->getOrdering(), StoreI->getSyncScopeID(), StoreI);
+    DEBUG(errs() << "Adding: " << *newStoreI << "\n");
+  }
+
+  // We need to do this explicitly: DCE pass will not remove them because we
+  // have assumed the worst memory behaviour for these function calls.
+  // Traverse the vector backwards, otherwise definitions are deleted while
+  // their subsequent uses are still around.
+  for (auto *I : reverse(IItoRemove)) {
+    DEBUG(errs() << "Erasing: " << *I << "\n");
+    I->eraseFromParent();
+  }
+
+  // Move the cloned functions from the parent module into the new module
+  for (auto *F : FuncToBeRemoved) {
+    F->removeFromParent(); // TODO: MARIA check
+    KernelM->getFunctionList().push_back(F);
+  }
+
+  addCLMetadata(F_nvptx);
+  kernel->KernelFunction = F_nvptx;
+  DEBUG(errs() << "Identified kernel - " << kernel->KernelFunction->getName()
+               << "\n");
+  DEBUG(errs() << *KernelM);
+
+  return;
 }
 
+bool DFG2LLVM_NVPTX::runOnModule(Module &M) {
+  DEBUG(errs() << "\nDFG2LLVM_NVPTX PASS\n");
+
+  // Get the BuildDFG Analysis Results:
+  // - Dataflow graph
+  // - Maps from i8* handles to DFNode and DFEdge
+  BuildDFG &DFG = getAnalysis<BuildDFG>();
+
+  // DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode *> Roots = DFG.getRoots();
+  //  BuildDFG::HandleToDFNode &HandleToDFNodeMap =
+  //      DFG.getHandleToDFNodeMap();
+  //  BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
+
+  // Visitor for Code Generation Graph Traversal
+  CGT_NVPTX *CGTVisitor = new CGT_NVPTX(M, DFG);
+
+  // Iterate over all the DFGs and produce code for each one of them
+  for (auto rootNode : Roots) {
+    // Initiate code generation for root DFNode
+    CGTVisitor->visit(rootNode);
+  }
 
-std::vector<unsigned> CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned>* GlobalMemArgs, Function* F) {
-	std::vector<unsigned> ConstantMemArgs;
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Argument* arg = &*ai; 
-		std::vector<unsigned>::iterator pos = std::find(GlobalMemArgs->begin(),
-				GlobalMemArgs->end(), arg->getArgNo());
-		// It has to be a global memory argument to be promotable
-		if(pos == GlobalMemArgs->end())
-			continue;
-
-		// Check if it can/should be promoted
-		if(canBePromoted(arg, F)) {
-			DEBUG(errs() << "Promoting << " << arg->getName()  << " to constant memory."<< "\n");
-			ConstantMemArgs.push_back(arg->getArgNo());
-			GlobalMemArgs->erase(pos);
-		}
-	}
-	return ConstantMemArgs;
+  CGTVisitor->writeKernelsModule();
+
+  // TODO: Edit module epilogue to remove the VISC intrinsic declarations
+  delete CGTVisitor;
+
+  return true;
 }
 
-Function* CGT_NVPTX::changeArgAddrspace(Function* F, std::vector<unsigned> &Args, unsigned addrspace) {
-	unsigned idx = 0;
-	std::vector<Type*> ArgTypes;
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Argument *arg = &*ai;
-		DEBUG(errs() << *arg << "\n");
-		unsigned argno = arg->getArgNo();
-		if ((idx < Args.size()) && (argno == Args[idx])) {
-			fixValueAddrspace(arg, addrspace);
-			idx++;
-		}
-		ArgTypes.push_back(arg->getType());
-	}
-	FunctionType* newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
-
-	//F->mutateType(PTy);
-	Function* newF = cloneFunction(F, newFT, false);
-	replaceNodeFunctionInIR(*F->getParent(), F, newF);
-
-	DEBUG(errs() << *newF->getFunctionType() << "\n" <<*newF << "\n");
-	return newF;
+std::string CGT_NVPTX::getKernelsModuleName(Module &M) {
+  /*SmallString<128> currentDir;
+          llvm::sys::fs::current_path(currentDir);
+          std::string fileName = getFilenameFromModule(M);
+          Twine output = Twine(currentDir) + "/Output/" + fileName + "";
+          return output.str().append(".kernels.ll");*/
+  std::string mid = M.getModuleIdentifier();
+  return mid.append(".kernels.ll");
 }
 
-/* Add metadata to module KernelM, for OpenCL kernels */
-void CGT_NVPTX::addCLMetadata(Function *F) {
+void CGT_NVPTX::fixValueAddrspace(Value *V, unsigned addrspace) {
+  assert(isa<PointerType>(V->getType()) && "Value should be of Pointer Type!");
+  PointerType *OldTy = cast<PointerType>(V->getType());
+  PointerType *NewTy = PointerType::get(OldTy->getElementType(), addrspace);
+  V->mutateType(NewTy);
+  for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue;
+       ui++) {
+    // Change all uses producing pointer type in same address space to new
+    // address space.
+    if (PointerType *PTy = dyn_cast<PointerType>((*ui)->getType())) {
+      if (PTy->getAddressSpace() == OldTy->getAddressSpace()) {
+        fixValueAddrspace(*ui, addrspace);
+      }
+    }
+  }
+}
 
-	IRBuilder<> Builder(&*F->begin());
+std::vector<unsigned>
+CGT_NVPTX::globalToConstantMemoryOpt(std::vector<unsigned> *GlobalMemArgs,
+                                     Function *F) {
+  std::vector<unsigned> ConstantMemArgs;
+  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
+       ++ai) {
+    Argument *arg = &*ai;
+    std::vector<unsigned>::iterator pos = std::find(
+        GlobalMemArgs->begin(), GlobalMemArgs->end(), arg->getArgNo());
+    // It has to be a global memory argument to be promotable
+    if (pos == GlobalMemArgs->end())
+      continue;
+
+    // Check if it can/should be promoted
+    if (canBePromoted(arg, F)) {
+      DEBUG(errs() << "Promoting << " << arg->getName()
+                   << " to constant memory."
+                   << "\n");
+      ConstantMemArgs.push_back(arg->getArgNo());
+      GlobalMemArgs->erase(pos);
+    }
+  }
+  return ConstantMemArgs;
+}
 
-	SmallVector<Metadata*,8> KernelMD;
-	KernelMD.push_back(ValueAsMetadata::get(F));
+Function *CGT_NVPTX::changeArgAddrspace(Function *F,
+                                        std::vector<unsigned> &Args,
+                                        unsigned addrspace) {
+  unsigned idx = 0;
+  std::vector<Type *> ArgTypes;
+  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
+       ++ai) {
+    Argument *arg = &*ai;
+    DEBUG(errs() << *arg << "\n");
+    unsigned argno = arg->getArgNo();
+    if ((idx < Args.size()) && (argno == Args[idx])) {
+      fixValueAddrspace(arg, addrspace);
+      idx++;
+    }
+    ArgTypes.push_back(arg->getType());
+  }
+  FunctionType *newFT = FunctionType::get(F->getReturnType(), ArgTypes, false);
 
-	// TODO: There is additional metadata used by kernel files but we skip them as
-	// they are not mandatory. In future they might be useful to enable
-	// optimizations
+  // F->mutateType(PTy);
+  Function *newF = cloneFunction(F, newFT, false);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
 
-	MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
-	NamedMDNode *MDN_kernels = KernelM->getOrInsertNamedMetadata("opencl.kernels");
-	MDN_kernels->addOperand(MDKernelNode);
+  DEBUG(errs() << *newF->getFunctionType() << "\n" << *newF << "\n");
+  return newF;
+}
 
-	KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
-	// TODO: Replace 1 with the number of the kernel.
-	// Add when support for multiple launces is added
-	KernelMD.push_back(ValueAsMetadata::get(ConstantInt::get(Type::getInt32Ty(KernelM->getContext()),1)));
-	MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
-	NamedMDNode *MDN_annotations = KernelM->getOrInsertNamedMetadata("nvvm.annotations");
-	MDN_annotations->addOperand(MDNvvmAnnotationsNode);
+/* Add metadata to module KernelM, for OpenCL kernels */
+void CGT_NVPTX::addCLMetadata(Function *F) {
 
+  IRBuilder<> Builder(&*F->begin());
+
+  SmallVector<Metadata *, 8> KernelMD;
+  KernelMD.push_back(ValueAsMetadata::get(F));
+
+  // TODO: There is additional metadata used by kernel files but we skip them as
+  // they are not mandatory. In future they might be useful to enable
+  // optimizations
+
+  MDTuple *MDKernelNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_kernels =
+      KernelM->getOrInsertNamedMetadata("opencl.kernels");
+  MDN_kernels->addOperand(MDKernelNode);
+
+  KernelMD.push_back(MDString::get(KernelM->getContext(), "kernel"));
+  // TODO: Replace 1 with the number of the kernel.
+  // Add when support for multiple launches is added
+  KernelMD.push_back(ValueAsMetadata::get(
+      ConstantInt::get(Type::getInt32Ty(KernelM->getContext()), 1)));
+  MDNode *MDNvvmAnnotationsNode = MDNode::get(KernelM->getContext(), KernelMD);
+  NamedMDNode *MDN_annotations =
+      KernelM->getOrInsertNamedMetadata("nvvm.annotations");
+  MDN_annotations->addOperand(MDNvvmAnnotationsNode);
 }
 
 void CGT_NVPTX::writeKernelsModule() {
 
-	// In addition to deleting all other functions, we also want to spiff it
-	// up a little bit.  Do this now.
-	legacy::PassManager Passes;
+  // In addition to deleting all other functions, we also want to spiff it
+  // up a little bit.  Do this now.
+  legacy::PassManager Passes;
 
   DEBUG(errs() << "Writing to File --- ");
   DEBUG(errs() << getKernelsModuleName(M).c_str() << "\n");
@@ -1996,105 +2029,103 @@ void CGT_NVPTX::writeKernelsModule() {
     DEBUG(errs() << EC.message() << '\n');
   }
 
-	Passes.add(
-			createPrintModulePass(Out.os()));
+  Passes.add(createPrintModulePass(Out.os()));
 
-	Passes.run(*KernelM);
+  Passes.run(*KernelM);
 
-	// Declare success.
-	Out.keep();
+  // Declare success.
+  Out.keep();
 }
 
-Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
-
-	DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
-	// FIXME: Maybe do that using the Node?
-	StructType* FRetTy = dyn_cast<StructType>(F->getReturnType());
-	assert(FRetTy && "Return Type must always be a struct");
-
-	// Keeps return statements, because we will need to replace them
-	std::vector<ReturnInst *> RItoRemove;
-	findReturnInst(F, RItoRemove);
-
-	std::vector<Type *> RetArgTypes;
-	std::vector<Argument*> RetArgs;
-	std::vector<Argument*> Args;
-	// Check for { } return struct, which means that the function returns void
-	if (FRetTy->isEmptyTy()) {
-
-		DEBUG(errs() << "\tFunction output struct is void\n");
-		DEBUG(errs() << "\tNo parameters added\n");
-
-		// Replacing return statements with others returning void
-		for (auto *RI : RItoRemove) {
-			ReturnInst::Create((F->getContext()), 0, RI);
-			RI->eraseFromParent();
-		}
-		DEBUG(errs() << "\tChanged return statements to return void\n");
-	}
-	else {
-		// The struct has return values, thus needs to be converted to parameter
-
-		// Iterate over all element types of return struct and add arguments to the
-		// function
-		for (unsigned i=0; i<FRetTy->getNumElements(); i++) {
-			Argument* RetArg = new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
-			RetArgs.push_back(RetArg);
-			RetArgTypes.push_back(RetArg->getType());
-			DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
-		}
-
-		DEBUG(errs() << "\tReplacing Return statements\n");
-		// Replace return statements with extractValue and store instructions
-		for (auto *RI : RItoRemove) {
-			Value* RetVal = RI->getReturnValue();
-			for(unsigned i = 0; i < RetArgs.size(); i++) {
-				ExtractValueInst* EI = ExtractValueInst::Create(RetVal, ArrayRef<unsigned>(i),
-						RetArgs[i]->getName()+".val", RI);
-				new StoreInst(EI, RetArgs[i], RI);
-			}
-			// assert(RetVal && "Return value should not be null at this point");
-			// StructType* RetType = cast<StructType>(RetVal->getType());
-			// assert(RetType && "Return type is not a struct");
-
-			ReturnInst::Create((F->getContext()), 0, RI);
-			RI->eraseFromParent();
-
-		}
-	}
-	DEBUG(errs() << "\tReplaced return statements\n");
-
-	// Create the argument type list with the added argument's type
-	std::vector<Type*> ArgTypes;
-	for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		ArgTypes.push_back(ai->getType());
-	}
-	for(auto *RATy: RetArgTypes) {
-		ArgTypes.push_back(RATy);
-	}
-
-	// Creating Args vector to use in cloning!
-	for(Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-			ai != ae; ++ai) {
-		Args.push_back(&*ai);
-	}
-	for(auto *ai : RetArgs) {
-		Args.push_back(ai);
-	}
-
-	// Adding new arguments to the function argument list, would not change the
-	// function type. We need to change the type of this function to reflect the
-	// added arguments
-	Type* VoidRetType = Type::getVoidTy(F->getContext());
-	FunctionType* newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
-
-	// Change the function type
-	//F->mutateType(PTy);
-	Function* newF = cloneFunction(F, newFT, false, NULL, &Args);
-	replaceNodeFunctionInIR(*F->getParent(), F, newF);
-	//F->eraseFromParent();
-	return newF;
+Function *CGT_NVPTX::transformFunctionToVoid(Function *F) {
+
+  DEBUG(errs() << "Transforming function to void: " << F->getName() << "\n");
+  // FIXME: Maybe do that using the Node?
+  StructType *FRetTy = dyn_cast<StructType>(F->getReturnType());
+  assert(FRetTy && "Return Type must always be a struct");
+
+  // Keeps return statements, because we will need to replace them
+  std::vector<ReturnInst *> RItoRemove;
+  findReturnInst(F, RItoRemove);
+
+  std::vector<Type *> RetArgTypes;
+  std::vector<Argument *> RetArgs;
+  std::vector<Argument *> Args;
+  // Check for { } return struct, which means that the function returns void
+  if (FRetTy->isEmptyTy()) {
+
+    DEBUG(errs() << "\tFunction output struct is void\n");
+    DEBUG(errs() << "\tNo parameters added\n");
+
+    // Replacing return statements with others returning void
+    for (auto *RI : RItoRemove) {
+      ReturnInst::Create((F->getContext()), 0, RI);
+      RI->eraseFromParent();
+    }
+    DEBUG(errs() << "\tChanged return statements to return void\n");
+  } else {
+    // The struct has return values, thus needs to be converted to parameter
+
+    // Iterate over all element types of return struct and add arguments to the
+    // function
+    for (unsigned i = 0; i < FRetTy->getNumElements(); i++) {
+      Argument *RetArg =
+          new Argument(FRetTy->getElementType(i)->getPointerTo(), "ret_arg", F);
+      RetArgs.push_back(RetArg);
+      RetArgTypes.push_back(RetArg->getType());
+      DEBUG(errs() << "\tCreated parameter: " << *RetArg << "\n");
+    }
+
+    DEBUG(errs() << "\tReplacing Return statements\n");
+    // Replace return statements with extractValue and store instructions
+    for (auto *RI : RItoRemove) {
+      Value *RetVal = RI->getReturnValue();
+      for (unsigned i = 0; i < RetArgs.size(); i++) {
+        ExtractValueInst *EI = ExtractValueInst::Create(
+            RetVal, ArrayRef<unsigned>(i), RetArgs[i]->getName() + ".val", RI);
+        new StoreInst(EI, RetArgs[i], RI);
+      }
+      // assert(RetVal && "Return value should not be null at this point");
+      // StructType* RetType = cast<StructType>(RetVal->getType());
+      // assert(RetType && "Return type is not a struct");
+
+      ReturnInst::Create((F->getContext()), 0, RI);
+      RI->eraseFromParent();
+    }
+  }
+  DEBUG(errs() << "\tReplaced return statements\n");
+
+  // Create the argument type list with the added argument's type
+  std::vector<Type *> ArgTypes;
+  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+       ai != ae; ++ai) {
+    ArgTypes.push_back(ai->getType());
+  }
+  for (auto *RATy : RetArgTypes) {
+    ArgTypes.push_back(RATy);
+  }
+
+  // Creating Args vector to use in cloning!
+  for (Function::arg_iterator ai = F->arg_begin(), ae = F->arg_end(); ai != ae;
+       ++ai) {
+    Args.push_back(&*ai);
+  }
+  for (auto *ai : RetArgs) {
+    Args.push_back(ai);
+  }
+
+  // Adding new arguments to the function argument list, would not change the
+  // function type. We need to change the type of this function to reflect the
+  // added arguments
+  Type *VoidRetType = Type::getVoidTy(F->getContext());
+  FunctionType *newFT = FunctionType::get(VoidRetType, ArgTypes, F->isVarArg());
+
+  // Change the function type
+  // F->mutateType(PTy);
+  Function *newF = cloneFunction(F, newFT, false, NULL, &Args);
+  replaceNodeFunctionInIR(*F->getParent(), F, newF);
+  // F->eraseFromParent();
+  return newF;
 }
 
 /******************************************************************************
@@ -2105,326 +2136,344 @@ Function* CGT_NVPTX::transformFunctionToVoid(Function* F) {
 // 1. No stores
 // 2. Loads not dependent on getNodeInstanceID itrinsic
 
-static bool findLoadStoreUses(Value* V, std::vector<Value*>*UseList, std::vector<Value*>*VisitedList) {
-	if(std::find(VisitedList->begin(), VisitedList->end(), V) != VisitedList->end()) {
-		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-		return false;
-	}
-	VisitedList->push_back(V);
-	for(Value::user_iterator ui = V->user_begin(), ue = V->user_end();
-			ui != ue; ++ui) {
-		Instruction* I = dyn_cast<Instruction>(*ui);
-		if(!I) {
-			// if use is not an instruction, then skip it
-			continue;
-		}
-		DEBUG(errs() << "\t" << *I << "\n");
-		if(isa<LoadInst>(I)) {
-			DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
-			DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
-			UseList->push_back(V);
-		}
-		else if(isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
-			// found a store in use chain
-			DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
-			return true;
-		}
-		else if(BuildDFG::isViscIntrinsic(I)) {
-			// If it is an atomic intrinsic, we found a store
-			IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-			assert(II && II->getCalledValue()->getName().startswith("llvm.visc.atomic")
-					&& "Only visc atomic intrinsics can have an argument as input");
-			return true;
-		}
-		else {
-			DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
-			if(findLoadStoreUses(I, UseList, VisitedList))
-				return true;
-		}
-	}
-	return false;
+static bool findLoadStoreUses(Value *V, std::vector<Value *> *UseList,
+                              std::vector<Value *> *VisitedList) {
+  if (std::find(VisitedList->begin(), VisitedList->end(), V) !=
+      VisitedList->end()) {
+    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+    return false;
+  }
+  VisitedList->push_back(V);
+  for (Value::user_iterator ui = V->user_begin(), ue = V->user_end(); ui != ue;
+       ++ui) {
+    Instruction *I = dyn_cast<Instruction>(*ui);
+    if (!I) {
+      // if use is not an instruction, then skip it
+      continue;
+    }
+    DEBUG(errs() << "\t" << *I << "\n");
+    if (isa<LoadInst>(I)) {
+      DEBUG(errs() << "\tFound load instruction: " << *I << "\n");
+      DEBUG(errs() << "\tAdd to use list: " << *V << "\n");
+      UseList->push_back(V);
+    } else if (isa<StoreInst>(I) || isa<AtomicRMWInst>(I)) {
+      // found a store in use chain
+      DEBUG(errs() << "Found store/atomicrmw instruction: " << *I << "\n");
+      return true;
+    } else if (BuildDFG::isViscIntrinsic(I)) {
+      // If it is an atomic intrinsic, we found a store
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+      assert(II &&
+             II->getCalledValue()->getName().startswith("llvm.visc.atomic") &&
+             "Only visc atomic intrinsics can have an argument as input");
+      return true;
+    } else {
+      DEBUG(errs() << "\tTraverse use chain of: " << *I << "\n");
+      if (findLoadStoreUses(I, UseList, VisitedList))
+        return true;
+    }
+  }
+  return false;
 }
 
-static bool isDependentOnNodeInstanceID(Value* V, std::vector<Value*>*DependenceList) {
-	if(std::find(DependenceList->begin(), DependenceList->end(), V) != DependenceList->end()) {
-		DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
-		return false;
-	}
-	DependenceList->push_back(V);
-	// If not an instruction, then not dependent on node instance id
-	if(!isa<Instruction>(V) || isa<Constant>(V)) {
-		DEBUG(errs() << "\tStop\n");
-		return false;
-	}
-
-	Instruction* I = cast<Instruction>(V);
-	for(unsigned i = 0; i < I->getNumOperands(); i++) {
-		Value* operand = I->getOperand(i);
-		if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(operand)) {
-			if((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x
-						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y
-						|| II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
-				Value* Node = II->getArgOperand(0);
-				IntrinsicInst* GN = dyn_cast<IntrinsicInst>(Node);
-				assert(GN && "NodeInstanceID operande should be node/parent node intrinsic\n");
-				if(GN->getIntrinsicID() == Intrinsic::visc_getNode) {
-					DEBUG(errs() << "\tDependency found on Node instance ID: " << *II << "\n");
-					return true;
-				}
-			}
-		}
-		if(CmpInst* CI = dyn_cast<CmpInst>(operand)) {
-			DEBUG(errs() << "Found compare instruction: "<< *CI<<"\nNot following its dependency list\n");
-			continue;
-		}
-		DEBUG( errs() << "\tTraverse the operand chain of: " << *operand << "\n");
-		if(isDependentOnNodeInstanceID(operand, DependenceList)) {
-			return true;
-		}
-	}
-	return false;
+static bool isDependentOnNodeInstanceID(Value *V,
+                                        std::vector<Value *> *DependenceList) {
+  if (std::find(DependenceList->begin(), DependenceList->end(), V) !=
+      DependenceList->end()) {
+    DEBUG(errs() << "\tAlready visited value: " << *V << "\n");
+    return false;
+  }
+  DependenceList->push_back(V);
+  // If not an instruction, then not dependent on node instance id
+  if (!isa<Instruction>(V) || isa<Constant>(V)) {
+    DEBUG(errs() << "\tStop\n");
+    return false;
+  }
+
+  Instruction *I = cast<Instruction>(V);
+  for (unsigned i = 0; i < I->getNumOperands(); i++) {
+    Value *operand = I->getOperand(i);
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(operand)) {
+      if ((II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_x ||
+           II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_y ||
+           II->getIntrinsicID() == Intrinsic::visc_getNodeInstanceID_z)) {
+        Value *Node = II->getArgOperand(0);
+        IntrinsicInst *GN = dyn_cast<IntrinsicInst>(Node);
+        assert(
+            GN &&
+            "NodeInstanceID operande should be node/parent node intrinsic\n");
+        if (GN->getIntrinsicID() == Intrinsic::visc_getNode) {
+          DEBUG(errs() << "\tDependency found on Node instance ID: " << *II
+                       << "\n");
+          return true;
+        }
+      }
+    }
+    if (CmpInst *CI = dyn_cast<CmpInst>(operand)) {
+      DEBUG(errs() << "Found compare instruction: " << *CI
+                   << "\nNot following its dependency list\n");
+      continue;
+    }
+    DEBUG(errs() << "\tTraverse the operand chain of: " << *operand << "\n");
+    if (isDependentOnNodeInstanceID(operand, DependenceList)) {
+      return true;
+    }
+  }
+  return false;
 }
 
 // Function to check if argument arg can be changed to a constant memory pointer
-static bool canBePromoted(Argument* arg, Function* F) {
-	DEBUG(errs() << "OPT: Check if Argument " << *arg << " can be changed to constant memory\n");
-	std::vector<Value*> UseList;
-	std::vector<Value*> VisitedList;
-	// recursively traverse use chain
-	// if find a store instruction return false, everything fails, cannot be
-	// promoted
-	// if find a load instruction as use, add the GEP instruction to list
-	bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
-	if(foundStore == true)
-		return false;
-	// See that the GEP instructions are not dependent on getNodeInstanceID
-	// intrinsic
-	DEBUG(errs() << foundStore << "\tNo Store Instruction found. Check dependence on node instance ID\n");
-	std::vector<Value*>DependenceList;
-	for(auto U: UseList) {
-		if(isDependentOnNodeInstanceID(U, &DependenceList))
-			return false;
-	}
-	DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
-	return true;
+static bool canBePromoted(Argument *arg, Function *F) {
+  DEBUG(errs() << "OPT: Check if Argument " << *arg
+               << " can be changed to constant memory\n");
+  std::vector<Value *> UseList;
+  std::vector<Value *> VisitedList;
+  // Recursively traverse the use chain of the argument.
+  // If a store instruction is found, everything fails: the argument cannot be
+  // promoted. If a load instruction is found as a use, the value it loads
+  // from (e.g. a GEP instruction) is added to the use list.
+  bool foundStore = findLoadStoreUses(arg, &UseList, &VisitedList);
+  if (foundStore == true)
+    return false;
+  // See that the GEP instructions are not dependent on getNodeInstanceID
+  // intrinsic
+  DEBUG(errs() << foundStore
+               << "\tNo Store Instruction found. Check dependence on node "
+                  "instance ID\n");
+  std::vector<Value *> DependenceList;
+  for (auto U : UseList) {
+    if (isDependentOnNodeInstanceID(U, &DependenceList))
+      return false;
+  }
+  DEBUG(errs() << "\tYes, Promotable to Constant Memory\n");
+  return true;
 }
 
-
 // Calculate execute node parameters which include, number of diemnsions for
 // dynamic instances of the kernel, local and global work group sizes.
-static void getExecuteNodeParams(Module &M, Value* &workDim, Value* &LocalWGPtr, Value*
-		&GlobalWGPtr, Kernel* kernel, ValueToValueMapTy& VMap, Instruction* IB) {
-
-	// Assign number of dimenstions a constant value
-	workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
-
-	// If local work group size if null
-	if(!kernel->hasLocalWG()) {
-		LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
-	}
-	else {
-		for(unsigned i = 0; i < kernel->localWGSize.size(); i++) {
-			if(isa<Argument>(kernel->localWGSize[i]))
-				kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
-		}
-		LocalWGPtr = genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
-	}
-
-	for(unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
-		if(isa<Argument>(kernel->globalWGSize[i]))
-			kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
-	}
-
-	// For OpenCL, global work group size is the total bumber of instances in each
-	// dimension. So, multiply local and global dim limits.
-	std::vector<Value*> globalWGSizeInsts;
-	if(kernel->hasLocalWG()) {
-		for (unsigned i = 0; i < kernel->gridDim; i++) {
-			BinaryOperator* MulInst = BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i], kernel->localWGSize[i], "", IB);
-			globalWGSizeInsts.push_back(MulInst);
-		}
-	}
-	else {
-		globalWGSizeInsts = kernel->globalWGSize;
-	}
-	GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
-	DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
+static void getExecuteNodeParams(Module &M, Value *&workDim, Value *&LocalWGPtr,
+                                 Value *&GlobalWGPtr, Kernel *kernel,
+                                 ValueToValueMapTy &VMap, Instruction *IB) {
+
+  // Assign the number of dimensions a constant value
+  workDim = ConstantInt::get(Type::getInt32Ty(M.getContext()), kernel->gridDim);
+
+  // If local work group size is null
+  if (!kernel->hasLocalWG()) {
+    LocalWGPtr = Constant::getNullValue(Type::getInt64PtrTy(M.getContext()));
+  } else {
+    for (unsigned i = 0; i < kernel->localWGSize.size(); i++) {
+      if (isa<Argument>(kernel->localWGSize[i]))
+        kernel->localWGSize[i] = VMap[kernel->localWGSize[i]];
+    }
+    LocalWGPtr =
+        genWorkGroupPtr(M, kernel->localWGSize, VMap, IB, "LocalWGSize");
+  }
+
+  for (unsigned i = 0; i < kernel->globalWGSize.size(); i++) {
+    if (isa<Argument>(kernel->globalWGSize[i]))
+      kernel->globalWGSize[i] = VMap[kernel->globalWGSize[i]];
+  }
+
+  // For OpenCL, global work group size is the total number of instances in each
+  // dimension. So, multiply local and global dim limits.
+  std::vector<Value *> globalWGSizeInsts;
+  if (kernel->hasLocalWG()) {
+    for (unsigned i = 0; i < kernel->gridDim; i++) {
+      BinaryOperator *MulInst =
+          BinaryOperator::Create(Instruction::Mul, kernel->globalWGSize[i],
+                                 kernel->localWGSize[i], "", IB);
+      globalWGSizeInsts.push_back(MulInst);
+    }
+  } else {
+    globalWGSizeInsts = kernel->globalWGSize;
+  }
+  GlobalWGPtr = genWorkGroupPtr(M, globalWGSizeInsts, VMap, IB, "GlobalWGSize");
+  DEBUG(errs() << "Pointer to global work group: " << *GlobalWGPtr << "\n");
 }
 
 // CodeGen for allocating space for Work Group on stack and returning a pointer
 // to its address
-static Value* genWorkGroupPtr(Module &M, std::vector<Value*> WGSize, ValueToValueMapTy& VMap, Instruction* IB, const Twine& WGName) {
-	Value* WGPtr;
-	// Get int64_t and or ease of use
-	Type* Int64Ty = Type::getInt64Ty(M.getContext());
-
-	// Work Group type is [#dim x i64]
-	Type* WGTy = ArrayType::get(Int64Ty, WGSize.size());
-	// Allocate space of Global work group data on stack and get pointer to
-	// first element.
-	AllocaInst* WG = new AllocaInst(WGTy, 0, WGName, IB);
-	WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(), WG->getName()+".0", IB);
-	Value* nextDim = WGPtr;
-	DEBUG(errs() << *WGPtr << "\n");
-
-	// Iterate over the number of dimensions and store the global work group
-	// size in that dimension
-	for(unsigned i=0; i < WGSize.size(); i++) {
-		DEBUG(errs() << *WGSize[i] << "\n");
-		assert(WGSize[i]->getType()->isIntegerTy() && "Dimension not an integer type!");
-
-		if(WGSize[i]->getType() != Int64Ty) {
-			// If number of dimensions are mentioned in any other integer format,
-			// generate code to extend it to i64. We need to use the mapped value in
-			// the new generated function, hence the use of VMap
-			// FIXME: Why are we changing the kernel WGSize vector here?
-			DEBUG(errs() << "Not i64. Zero extend required.\n");
-			DEBUG(errs() << *WGSize[i] << "\n");
-			CastInst* CI = BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
-			DEBUG(errs() << "Bitcast done.\n");
-			StoreInst* SI = new StoreInst(CI, nextDim, IB);
-			DEBUG(errs() << "Zero extend done.\n");
-			DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
-		} else {
-			// Store the value representing work group size in ith dimension on
-			// stack
-			StoreInst* SI = new StoreInst(WGSize[i], nextDim, IB);
-
-			DEBUG(errs() << "\t Work group size: " << *SI << "\n");
-		}
-		if(i+1 < WGSize.size()) {
-			// Move to next dimension
-			GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, nextDim,
-					ArrayRef<Value*>(ConstantInt::get(Int64Ty, 1)),
-					WG->getName()+"."+Twine(i+1),
-					IB);
-			DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
-			nextDim = GEP;
-		}
-	}
-	return WGPtr;
+static Value *genWorkGroupPtr(Module &M, std::vector<Value *> WGSize,
+                              ValueToValueMapTy &VMap, Instruction *IB,
+                              const Twine &WGName) {
+  Value *WGPtr;
+  // Get the 64-bit integer type once, for ease of use
+  Type *Int64Ty = Type::getInt64Ty(M.getContext());
+
+  // Work Group type is [#dim x i64]
+  Type *WGTy = ArrayType::get(Int64Ty, WGSize.size());
+  // Allocate space of Global work group data on stack and get pointer to
+  // first element.
+  AllocaInst *WG = new AllocaInst(WGTy, 0, WGName, IB);
+  WGPtr = BitCastInst::CreatePointerCast(WG, Int64Ty->getPointerTo(),
+                                         WG->getName() + ".0", IB);
+  Value *nextDim = WGPtr;
+  DEBUG(errs() << *WGPtr << "\n");
+
+  // Iterate over the number of dimensions and store the global work group
+  // size in that dimension
+  for (unsigned i = 0; i < WGSize.size(); i++) {
+    DEBUG(errs() << *WGSize[i] << "\n");
+    assert(WGSize[i]->getType()->isIntegerTy() &&
+           "Dimension not an integer type!");
+
+    if (WGSize[i]->getType() != Int64Ty) {
+      // If a dimension size is given in any other integer type, generate code
+      // to extend it to i64. We need to use the mapped value in the new
+      // generated function, hence the use of VMap
+      // FIXME: Why are we changing the kernel WGSize vector here?
+      DEBUG(errs() << "Not i64. Zero extend required.\n");
+      DEBUG(errs() << *WGSize[i] << "\n");
+      CastInst *CI =
+          BitCastInst::CreateIntegerCast(WGSize[i], Int64Ty, true, "", IB);
+      DEBUG(errs() << "Bitcast done.\n");
+      StoreInst *SI = new StoreInst(CI, nextDim, IB);
+      DEBUG(errs() << "Zero extend done.\n");
+      DEBUG(errs() << "\tZero extended work group size: " << *SI << "\n");
+    } else {
+      // Store the value representing work group size in ith dimension on
+      // stack
+      StoreInst *SI = new StoreInst(WGSize[i], nextDim, IB);
 
+      DEBUG(errs() << "\t Work group size: " << *SI << "\n");
+    }
+    if (i + 1 < WGSize.size()) {
+      // Move to next dimension
+      GetElementPtrInst *GEP = GetElementPtrInst::Create(
+          nullptr, nextDim, ArrayRef<Value *>(ConstantInt::get(Int64Ty, 1)),
+          WG->getName() + "." + Twine(i + 1), IB);
+      DEBUG(errs() << "\tPointer to next dimension on stack: " << *GEP << "\n");
+      nextDim = GEP;
+    }
+  }
+  return WGPtr;
 }
 
 // Get generated PTX binary name
-static std::string getPTXFilename(const Module& M) {
-	std::string moduleID = M.getModuleIdentifier();
-	moduleID.append(".kernels.cl");
-	return moduleID;
+static std::string getPTXFilename(const Module &M) {
+  std::string moduleID = M.getModuleIdentifier();
+  moduleID.append(".kernels.cl");
+  return moduleID;
 }
 
 // Get the name of the input file from module ID
-static std::string getFilenameFromModule(const Module& M) {
-	std::string moduleID = M.getModuleIdentifier();
-	return moduleID.substr(moduleID.find_last_of("/")+1);
+static std::string getFilenameFromModule(const Module &M) {
+  std::string moduleID = M.getModuleIdentifier();
+  return moduleID.substr(moduleID.find_last_of("/") + 1);
 }
 
 // Changes the data layout of the Module to be compiled with NVPTX backend
 // TODO: Figure out when to call it, probably after duplicating the modules
 static void changeDataLayout(Module &M) {
-	std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
-	std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
+  std::string nvptx32_layoutStr = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64";
+  std::string nvptx64_layoutStr = "e-i64:64-v16:16-v32:32-n16:32:64";
 
-	if (TARGET_PTX == 32)
-		M.setDataLayout(StringRef(nvptx32_layoutStr));
-	else if (TARGET_PTX == 64)
-		M.setDataLayout(StringRef(nvptx64_layoutStr));
-	else assert(false && "Invalid PTX target");
+  if (TARGET_PTX == 32)
+    M.setDataLayout(StringRef(nvptx32_layoutStr));
+  else if (TARGET_PTX == 64)
+    M.setDataLayout(StringRef(nvptx64_layoutStr));
+  else
+    assert(false && "Invalid PTX target");
 
-	return;
+  return;
 }
 
 static void changeTargetTriple(Module &M) {
-	std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
-	std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
+  std::string nvptx32_TargetTriple = "nvptx--nvidiacl";
+  std::string nvptx64_TargetTriple = "nvptx64--nvidiacl";
 
-	if (TARGET_PTX == 32)
-		M.setTargetTriple(StringRef(nvptx32_TargetTriple));
-	else if (TARGET_PTX == 64)
-		M.setTargetTriple(StringRef(nvptx64_TargetTriple));
-	else assert(false && "Invalid PTX target");
+  if (TARGET_PTX == 32)
+    M.setTargetTriple(StringRef(nvptx32_TargetTriple));
+  else if (TARGET_PTX == 64)
+    M.setTargetTriple(StringRef(nvptx64_TargetTriple));
+  else
+    assert(false && "Invalid PTX target");
 
-	return;
+  return;
 }
 
 // Helper function, populate a vector with all return statements in a function
-static void findReturnInst(Function* F, std::vector<ReturnInst *> & ReturnInstVec) {
-	for (auto &BB : *F) {
-		if(auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
-			ReturnInstVec.push_back(RI);
-	}	
+static void findReturnInst(Function *F,
+                           std::vector<ReturnInst *> &ReturnInstVec) {
+  for (auto &BB : *F) {
+    if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+      ReturnInstVec.push_back(RI);
+  }
 }
 
-// Helper function, populate a vector with all IntrinsicID intrinsics in a function
-static void findIntrinsicInst(Function* F, Intrinsic::ID IntrinsicID, std::vector<IntrinsicInst *> & IntrinsicInstVec) {
-	for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
-		Instruction *I = &(*i);
-		IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
-		if (II && II->getIntrinsicID() == IntrinsicID) {
-			IntrinsicInstVec.push_back(II);
-		}
-	}
+// Helper function, populate a vector with all IntrinsicID intrinsics in a
+// function
+static void findIntrinsicInst(Function *F, Intrinsic::ID IntrinsicID,
+                              std::vector<IntrinsicInst *> &IntrinsicInstVec) {
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+    Instruction *I = &(*i);
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+    if (II && II->getIntrinsicID() == IntrinsicID) {
+      IntrinsicInstVec.push_back(II);
+    }
+  }
 }
 
-// Helper funtion, returns the atomicrmw op, corresponding to intrinsic atomic op
+// Helper function, returns the atomicrmw op corresponding to intrinsic atomic
+// op
 static AtomicRMWInst::BinOp getAtomicOp(Intrinsic::ID ID) {
-	switch(ID) {
-		case Intrinsic::visc_atomic_add:
-			return AtomicRMWInst::Add;
-		case Intrinsic::visc_atomic_sub:
-			return AtomicRMWInst::Sub;
-		case Intrinsic::visc_atomic_min:
-			return AtomicRMWInst::Min;
-		case Intrinsic::visc_atomic_umin:
-			return AtomicRMWInst::UMin;
-		case Intrinsic::visc_atomic_max:
-			return AtomicRMWInst::Max;
-		case Intrinsic::visc_atomic_umax:
-			return AtomicRMWInst::UMax;
-			//case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc;
-			//case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec;
-		case Intrinsic::visc_atomic_xchg:
-			return AtomicRMWInst::Xchg;
-		case Intrinsic::visc_atomic_and:
-			return AtomicRMWInst::And;
-		case Intrinsic::visc_atomic_or:
-			return AtomicRMWInst::Or;
-		case Intrinsic::visc_atomic_xor:
-			return AtomicRMWInst::Xor;
-		default:
-			llvm_unreachable("Unsupported atomic intrinsic!");
-	};
+  switch (ID) {
+  case Intrinsic::visc_atomic_add:
+    return AtomicRMWInst::Add;
+  case Intrinsic::visc_atomic_sub:
+    return AtomicRMWInst::Sub;
+  case Intrinsic::visc_atomic_min:
+    return AtomicRMWInst::Min;
+  case Intrinsic::visc_atomic_umin:
+    return AtomicRMWInst::UMin;
+  case Intrinsic::visc_atomic_max:
+    return AtomicRMWInst::Max;
+  case Intrinsic::visc_atomic_umax:
+    return AtomicRMWInst::UMax;
+    // case Intrinsic::visc_atomic_inc: return AtomicRMWInst::Inc;
+    // case Intrinsic::visc_atomic_dec: return AtomicRMWInst::Dec;
+  case Intrinsic::visc_atomic_xchg:
+    return AtomicRMWInst::Xchg;
+  case Intrinsic::visc_atomic_and:
+    return AtomicRMWInst::And;
+  case Intrinsic::visc_atomic_or:
+    return AtomicRMWInst::Or;
+  case Intrinsic::visc_atomic_xor:
+    return AtomicRMWInst::Xor;
+  default:
+    llvm_unreachable("Unsupported atomic intrinsic!");
+  };
 }
 
-
 // Helper funtion, returns the OpenCL function name, corresponding to atomic op
 static std::string getAtomicOpName(Intrinsic::ID ID) {
-	switch(ID) {
-		case Intrinsic::visc_atomic_cmpxchg:
-			return "atom_cmpxchg";
-		case Intrinsic::visc_atomic_add:
-			return "atom_add";
-		case Intrinsic::visc_atomic_sub:
-			return "atom_sub";
-		case Intrinsic::visc_atomic_min:
-			return "atom_min";
-		case Intrinsic::visc_atomic_max:
-			return "atom_max";
-		case Intrinsic::visc_atomic_inc:
-			return "atom_inc";
-		case Intrinsic::visc_atomic_dec:
-			return "atom_dec";
-		case Intrinsic::visc_atomic_xchg:
-			return "atom_xchg";
-		case Intrinsic::visc_atomic_and:
-			return "atom_and";
-		case Intrinsic::visc_atomic_or:
-			return "atom_or";
-		case Intrinsic::visc_atomic_xor:
-			return "atom_xor";
-		default:
-			llvm_unreachable("Unsupported atomic intrinsic!");
-	};
+  switch (ID) {
+  case Intrinsic::visc_atomic_cmpxchg:
+    return "atom_cmpxchg";
+  case Intrinsic::visc_atomic_add:
+    return "atom_add";
+  case Intrinsic::visc_atomic_sub:
+    return "atom_sub";
+  case Intrinsic::visc_atomic_min:
+    return "atom_min";
+  case Intrinsic::visc_atomic_max:
+    return "atom_max";
+  case Intrinsic::visc_atomic_inc:
+    return "atom_inc";
+  case Intrinsic::visc_atomic_dec:
+    return "atom_dec";
+  case Intrinsic::visc_atomic_xchg:
+    return "atom_xchg";
+  case Intrinsic::visc_atomic_and:
+    return "atom_and";
+  case Intrinsic::visc_atomic_or:
+    return "atom_or";
+  case Intrinsic::visc_atomic_xor:
+    return "atom_xor";
+  default:
+    llvm_unreachable("Unsupported atomic intrinsic!");
+  };
 }
 
 } // End of namespace
@@ -2435,4 +2484,3 @@ static RegisterPass<DFG2LLVM_NVPTX> X("dfg2llvm-nvptx",
 		false /* does not modify the CFG */,
 		true /* transformation,   *
 					* not just analysis */);
-
diff --git a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
index 90d7de11fa4a2e44be72360c0568ca63b6882b14..8ec14c80805c052b4a356df7b29b6f1cae2ab775 100644
--- a/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
+++ b/hpvm/lib/Transforms/DFG2LLVM_X86/DFG2LLVM_X86.cpp
@@ -8,31 +8,30 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "DFG2LLVM_X86"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
+#include "SupportVISC/DFG2LLVM.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker/Linker.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/SourceMgr.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Constant.h"
-#include "SupportVISC/DFG2LLVM.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 using namespace llvm;
 using namespace builddfg;
 using namespace dfg2llvm;
 
 // VISC Command line option to use timer or not
-static cl::opt<bool>
-VISCTimer_X86("visc-timers-x86", cl::desc("Enable visc timers"));
+static cl::opt<bool> VISCTimer_X86("visc-timers-x86",
+                                   cl::desc("Enable visc timers"));
 // Command line option to enable device abstraction or not
 static cl::opt<bool>
-DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden,
-                   cl::desc("Enable visc device abstraction"));
-
+    DeviceAbstraction("visc-eda", cl::init(false), cl::Hidden,
+                      cl::desc("Enable visc device abstraction"));
 
 namespace {
 
@@ -41,7 +40,8 @@ static bool isVISCCall_llvm_visc_policy_getVersion(Instruction *I) {
   if (!isa<CallInst>(I))
     return false;
   CallInst *CI = cast<CallInst>(I);
-  return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("llvm_visc_policy_getVersion");
+  return (CI->getCalledValue()->stripPointerCasts()->getName())
+      .equals("llvm_visc_policy_getVersion");
 }
 
 CallInst *get_llvm_visc_policy_getVersion_call(Function *F) {
@@ -56,7 +56,7 @@ CallInst *get_llvm_visc_policy_getVersion_call(Function *F) {
 // DFG2LLVM_X86 - The first implementation.
 struct DFG2LLVM_X86 : public DFG2LLVM {
   static char ID; // Pass identification, replacement for typeid
-  DFG2LLVM_X86() :DFG2LLVM(ID) {}
+  DFG2LLVM_X86() : DFG2LLVM(ID) {}
 
 private:
   // Member variables
@@ -71,7 +71,7 @@ public:
 class CGT_X86 : public CodeGenTraversal {
 
 private:
-  //Member variables
+  // Member variables
 
   FunctionCallee malloc;
   // VISC Runtime API
@@ -88,34 +88,35 @@ private:
   FunctionCallee llvm_visc_createEdgeBuffer;
   FunctionCallee llvm_visc_createLastInputBuffer;
   FunctionCallee llvm_visc_createThread;
-  //Constant* llvm_visc_freeThreads;
+  // Constant* llvm_visc_freeThreads;
   FunctionCallee llvm_visc_bufferPush;
   FunctionCallee llvm_visc_bufferPop;
   FunctionCallee llvm_visc_x86_dstack_push;
   FunctionCallee llvm_visc_x86_dstack_pop;
   FunctionCallee llvm_visc_x86_getDimLimit;
   FunctionCallee llvm_visc_x86_getDimInstance;
-  
-  //Functions
-  std::vector<IntrinsicInst*>* getUseList(Value* LI);
-  Value* addLoop(Instruction* I, Value* limit, const Twine& indexName = "");
-  void addWhileLoop(Instruction*, Instruction*, Instruction*, Value*);
+
+  // Functions
+  std::vector<IntrinsicInst *> *getUseList(Value *LI);
+  Value *addLoop(Instruction *I, Value *limit, const Twine &indexName = "");
+  void addWhileLoop(Instruction *, Instruction *, Instruction *, Value *);
   Instruction *addWhileLoopCounter(BasicBlock *, BasicBlock *, BasicBlock *);
-  Argument* getArgumentFromEnd(Function* F, unsigned offset);
-  Value* getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86,
-                      Instruction* InsertBefore);
-  void invokeChild_X86(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
-                       Instruction* InsertBefore);
-  void invokeChild_PTX(DFNode* C, Function* F_X86, ValueToValueMapTy &VMap,
-                       Instruction* InsertBefore);
-  StructType* getArgumentListStructTy(DFNode*);
-  Function* createFunctionFilter(DFNode* C);
-  void startNodeThread(DFNode*, std::vector<Value*>, DenseMap<DFEdge*, Value*>,
-                      Value*, Value*, Instruction*);
-  Function* createLaunchFunction(DFInternalNode*);
-  Function* createPushFunction(DFInternalNode*);
-  Function* createPopFunction(DFInternalNode*);
-  Function* createWaitFunction(DFInternalNode*);
+  Argument *getArgumentFromEnd(Function *F, unsigned offset);
+  Value *getInValueAt(DFNode *Child, unsigned i, Function *ParentF_X86,
+                      Instruction *InsertBefore);
+  void invokeChild_X86(DFNode *C, Function *F_X86, ValueToValueMapTy &VMap,
+                       Instruction *InsertBefore);
+  void invokeChild_PTX(DFNode *C, Function *F_X86, ValueToValueMapTy &VMap,
+                       Instruction *InsertBefore);
+  StructType *getArgumentListStructTy(DFNode *);
+  Function *createFunctionFilter(DFNode *C);
+  void startNodeThread(DFNode *, std::vector<Value *>,
+                       DenseMap<DFEdge *, Value *>, Value *, Value *,
+                       Instruction *);
+  Function *createLaunchFunction(DFInternalNode *);
+  Function *createPushFunction(DFInternalNode *);
+  Function *createPopFunction(DFInternalNode *);
+  Function *createWaitFunction(DFInternalNode *);
 
   // Virtual Functions
   void init() {
@@ -123,10 +124,10 @@ private:
     TargetName = "X86";
   }
   void initRuntimeAPI();
-  void codeGen(DFInternalNode* N);
-  void codeGen(DFLeafNode* N);
-  Function* codeGenStreamPush(DFInternalNode* N);
-  Function* codeGenStreamPop(DFInternalNode* N);
+  void codeGen(DFInternalNode *N);
+  void codeGen(DFLeafNode *N);
+  Function *codeGenStreamPush(DFInternalNode *N);
+  Function *codeGenStreamPop(DFInternalNode *N);
 
 public:
   // Constructor
@@ -135,8 +136,8 @@ public:
     initRuntimeAPI();
   }
 
-  void codeGenLaunch(DFInternalNode* Root);
-  void codeGenLaunchStreaming(DFInternalNode* Root);
+  void codeGenLaunch(DFInternalNode *Root);
+  void codeGenLaunchStreaming(DFInternalNode *Root);
 };
 
 bool DFG2LLVM_X86::runOnModule(Module &M) {
@@ -147,8 +148,8 @@ bool DFG2LLVM_X86::runOnModule(Module &M) {
   // - Maps from i8* hansles to DFNode and DFEdge
   BuildDFG &DFG = getAnalysis<BuildDFG>();
 
-  //DFInternalNode *Root = DFG.getRoot();
-  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+  // DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode *> Roots = DFG.getRoots();
   // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
   // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
@@ -156,16 +157,17 @@ bool DFG2LLVM_X86::runOnModule(Module &M) {
   CGT_X86 *CGTVisitor = new CGT_X86(M, DFG);
 
   // Iterate over all the DFGs and produce code for each one of them
-  for (auto rootNode: Roots) {
+  for (auto rootNode : Roots) {
     // Initiate code generation for root DFNode
     CGTVisitor->visit(rootNode);
-    // Go ahead and replace the launch intrinsic with pthread call, otherwise return now.
+    // Go ahead and replace the launch intrinsic with pthread call, otherwise
+    // return now.
     // TODO: Later on, we might like to do this in a separate pass, which would
-    // allow us the flexibility to switch between complete static code generation
-    // for DFG or having a customized runtime+scheduler
-    
+    // allow us the flexibility to switch between complete static code
+    // generation for DFG or having a customized runtime+scheduler
+
     // Do streaming code generation if root node is streaming. Usual otherwise
-    if(rootNode->isChildGraphStreaming())
+    if (rootNode->isChildGraphStreaming())
       CGTVisitor->codeGenLaunchStreaming(rootNode);
     else
       CGTVisitor->codeGenLaunch(rootNode);
@@ -181,7 +183,7 @@ void CGT_X86::initRuntimeAPI() {
   // Load Runtime API Module
   SMDiagnostic Err;
 
-  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
   assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
 
   Twine llvmSrcRoot = LLVM_SRC_ROOT;
@@ -189,7 +191,7 @@ void CGT_X86::initRuntimeAPI() {
 
   runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
 
-  if(runtimeModule == NULL)
+  if (runtimeModule == NULL)
     DEBUG(errs() << Err.getMessage());
   else
     DEBUG(errs() << "Successfully loaded visc-rt API module\n");
@@ -208,7 +210,7 @@ void CGT_X86::initRuntimeAPI() {
   DECLARE(llvm_visc_createEdgeBuffer);
   DECLARE(llvm_visc_createLastInputBuffer);
   DECLARE(llvm_visc_createThread);
-  //DECLARE(llvm_visc_freeThreads);
+  // DECLARE(llvm_visc_freeThreads);
   DECLARE(llvm_visc_bufferPush);
   DECLARE(llvm_visc_bufferPop);
   DECLARE(llvm_visc_x86_dstack_push);
@@ -220,36 +222,40 @@ void CGT_X86::initRuntimeAPI() {
   initTimerAPI();
 
   // Insert init context in main
-  Function* VI = M.getFunction("llvm.visc.init");
+  Function *VI = M.getFunction("llvm.visc.init");
   assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
   DEBUG(errs() << "Inserting x86 timer initialization\n");
-  Instruction* I = cast<Instruction>(*VI->user_begin());
+  Instruction *I = cast<Instruction>(*VI->user_begin());
   initializeTimerSet(I);
   switchToTimer(visc_TimerID_NONE, I);
   // Insert code for initializing the sceduling policy
-  FunctionCallee IP = M.getOrInsertFunction("llvm_visc_policy_init",
-		      runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType());
-  CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  FunctionCallee IP = M.getOrInsertFunction(
+      "llvm_visc_policy_init",
+      runtimeModule->getFunction("llvm_visc_policy_init")->getFunctionType());
+  CallInst *IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I);
   DEBUG(errs() << *IPCallInst << "\n");
 
   // If device abstraction is enabled, we add a runtime call to start the
   // device status simulation
   if (DeviceAbstraction) {
-    FunctionCallee ID = M.getOrInsertFunction("llvm_visc_deviceAbstraction_start",
-        runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")->getFunctionType());
-    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
+    FunctionCallee ID = M.getOrInsertFunction(
+        "llvm_visc_deviceAbstraction_start",
+        runtimeModule->getFunction("llvm_visc_deviceAbstraction_start")
+            ->getFunctionType());
+    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I);
     DEBUG(errs() << *IDCallInst << "\n");
   }
 
   // Insert print instruction at visc exit
-  Function* VC = M.getFunction("llvm.visc.cleanup");
+  Function *VC = M.getFunction("llvm.visc.cleanup");
   assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
 
   // Insert code for clearing the sceduling policy
   I = cast<Instruction>(*VC->user_begin());
-  IP = M.getOrInsertFunction("llvm_visc_policy_clear",
-    runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType());
-  IPCallInst = CallInst::Create(IP, ArrayRef<Value*>(), "", I);
+  IP = M.getOrInsertFunction(
+      "llvm_visc_policy_clear",
+      runtimeModule->getFunction("llvm_visc_policy_clear")->getFunctionType());
+  IPCallInst = CallInst::Create(IP, ArrayRef<Value *>(), "", I);
   DEBUG(errs() << *IPCallInst << "\n");
 
   DEBUG(errs() << "Inserting x86 timer print\n");
@@ -258,22 +264,24 @@ void CGT_X86::initRuntimeAPI() {
   // If device abstraction is enabled, we add a runtime call to end the
   // device status simulation
   if (DeviceAbstraction) {
-    FunctionCallee ID = M.getOrInsertFunction("llvm_visc_deviceAbstraction_end",
-        runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")->getFunctionType());
-    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value*>(), "", I);
+    FunctionCallee ID = M.getOrInsertFunction(
+        "llvm_visc_deviceAbstraction_end",
+        runtimeModule->getFunction("llvm_visc_deviceAbstraction_end")
+            ->getFunctionType());
+    CallInst *IDCallInst = CallInst::Create(ID, ArrayRef<Value *>(), "", I);
     DEBUG(errs() << *IDCallInst << "\n");
   }
-
 }
 
 /* Returns vector of all wait instructions
  */
-std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) {
-  std::vector<IntrinsicInst*>* UseList = new std::vector<IntrinsicInst*>();
+std::vector<IntrinsicInst *> *CGT_X86::getUseList(Value *GraphID) {
+  std::vector<IntrinsicInst *> *UseList = new std::vector<IntrinsicInst *>();
   // It must have been loaded from memory somewhere
-  for(Value::user_iterator ui = GraphID->user_begin(),
-      ue = GraphID->user_end(); ui!=ue; ++ui) {
-    if(IntrinsicInst* waitI = dyn_cast<IntrinsicInst>(*ui)) {
+  for (Value::user_iterator ui = GraphID->user_begin(),
+                            ue = GraphID->user_end();
+       ui != ue; ++ui) {
+    if (IntrinsicInst *waitI = dyn_cast<IntrinsicInst>(*ui)) {
       UseList->push_back(waitI);
     } else {
       llvm_unreachable("Error: Operation on Graph ID not supported!\n");
@@ -285,14 +293,14 @@ std::vector<IntrinsicInst*>* CGT_X86::getUseList(Value* GraphID) {
 /* Traverse the function argument list in reverse order to get argument at a
  * distance offset fromt he end of argument list of function F
  */
-Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) {
-  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0)
-         && "Invalid offset to access arguments!");
+Argument *CGT_X86::getArgumentFromEnd(Function *F, unsigned offset) {
+  assert((F->getFunctionType()->getNumParams() >= offset && offset > 0) &&
+         "Invalid offset to access arguments!");
   Function::arg_iterator e = F->arg_end();
   // Last element of argument iterator is dummy. Skip it.
   e--;
-  Argument* arg;
-  for( ; offset != 0; e--) {
+  Argument *arg;
+  for (; offset != 0; e--) {
     offset--;
     arg = &*e;
   }
@@ -310,25 +318,24 @@ Argument* CGT_X86::getArgumentFromEnd(Function* F, unsigned offset) {
  * which loops over bidy if true and goes to end if false
  * (5) Update phi node of body
  */
-void CGT_X86::addWhileLoop(Instruction* CondBlockStart, Instruction* BodyStart,
-                          Instruction* BodyEnd, Value* TerminationCond) {
-  BasicBlock* Entry = CondBlockStart->getParent();
-  BasicBlock* CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition");
-  BasicBlock* WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body");
-  BasicBlock* WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end");
+void CGT_X86::addWhileLoop(Instruction *CondBlockStart, Instruction *BodyStart,
+                           Instruction *BodyEnd, Value *TerminationCond) {
+  BasicBlock *Entry = CondBlockStart->getParent(); // block that gets split three times below
+  BasicBlock *CondBlock = Entry->splitBasicBlock(CondBlockStart, "condition"); // begins at CondBlockStart
+  BasicBlock *WhileBody = CondBlock->splitBasicBlock(BodyStart, "while.body"); // begins at BodyStart
+  BasicBlock *WhileEnd = WhileBody->splitBasicBlock(BodyEnd, "while.end"); // begins at BodyEnd
 
   // Replace the terminator instruction of conditional with new conditional
   // branch which goes to while.body if true and branches to while.end otherwise
+  BranchInst *BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond); // NOTE: TerminationCond true exits to while.end; false enters while.body
-  BranchInst* BI = BranchInst::Create(WhileEnd, WhileBody, TerminationCond);
   ReplaceInstWithInst(CondBlock->getTerminator(), BI);
 
   // While Body should jump to condition block
-  BranchInst* UnconditionalBranch = BranchInst::Create(CondBlock);
+  BranchInst *UnconditionalBranch = BranchInst::Create(CondBlock); // back edge of the loop
   ReplaceInstWithInst(WhileBody->getTerminator(), UnconditionalBranch);
-
 }
 
-Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
+Instruction *CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
                                           BasicBlock *Body) {
   Module *M = Entry->getParent()->getParent();
   Type *Int64Ty = Type::getInt64Ty(M->getContext());
@@ -338,10 +345,10 @@ Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
   PHINode *CounterPhi = PHINode::Create(Int64Ty, 2, "cnt", IB);
 
   ConstantInt *IConst =
-    ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
+      ConstantInt::get(Type::getInt64Ty(M->getContext()), 1, true);
   Instruction *CounterIncr =
-    BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
-                                            "cnt_incr", Body->getTerminator());
+      BinaryOperator::CreateNSW(Instruction::BinaryOps::Add, CounterPhi, IConst,
+                                "cnt_incr", Body->getTerminator());
 
   // Set incoming values for Phi node
   IConst = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0, true);
@@ -363,39 +370,40 @@ Instruction* CGT_X86::addWhileLoopCounter(BasicBlock *Entry, BasicBlock *Cond,
  * which loops over bidy if true and goes to end if false
  * (5) Update phi node of body
  */
-Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) {
-  BasicBlock* Entry = I->getParent();
-  BasicBlock* ForBody = Entry->splitBasicBlock(I, "for.body");
+Value *CGT_X86::addLoop(Instruction *I, Value *limit, const Twine &indexName) {
+  BasicBlock *Entry = I->getParent();
+  BasicBlock *ForBody = Entry->splitBasicBlock(I, "for.body");
 
   BasicBlock::iterator i(I);
   ++i;
-  Instruction* NextI = &*i;
+  Instruction *NextI = &*i;
   // Next Instruction should also belong to the same basic block as the basic
   // block will have a terminator instruction
-  assert(NextI->getParent() == ForBody
-         && "Next Instruction should also belong to the same basic block!");
-  BasicBlock* ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
-
+  assert(NextI->getParent() == ForBody &&
+         "Next Instruction should also belong to the same basic block!");
+  BasicBlock *ForEnd = ForBody->splitBasicBlock(NextI, "for.end");
 
   // Add Phi Node for index variable
-  PHINode* IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()),
-                                      2, "index."+indexName, I);
+  PHINode *IndexPhi = PHINode::Create(Type::getInt64Ty(I->getContext()), 2,
+                                      "index." + indexName, I);
 
   // Add incoming edge to phi
   IndexPhi->addIncoming(ConstantInt::get(Type::getInt64Ty(I->getContext()), 0),
                         Entry);
   // Increment index variable
-  BinaryOperator* IndexInc = BinaryOperator::Create(Instruction::Add,
-                             IndexPhi, ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
-                             "index."+indexName+".inc", ForBody->getTerminator());
+  BinaryOperator *IndexInc = BinaryOperator::Create(
+      Instruction::Add, IndexPhi,
+      ConstantInt::get(Type::getInt64Ty(I->getContext()), 1),
+      "index." + indexName + ".inc", ForBody->getTerminator());
 
   // Compare index variable with limit
-  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc,
-                                  limit, "cond."+indexName, ForBody->getTerminator());
+  CmpInst *Cond =
+      CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, IndexInc, limit,
+                      "cond." + indexName, ForBody->getTerminator());
 
   // Replace the terminator instruction of for.body with new conditional
   // branch which loops over body if true and branches to for.end otherwise
-  BranchInst* BI = BranchInst::Create(ForBody, ForEnd, Cond);
+  BranchInst *BI = BranchInst::Create(ForBody, ForEnd, Cond);
   ReplaceInstWithInst(ForBody->getTerminator(), BI);
 
   // Add incoming edge to phi node in body
@@ -407,263 +415,274 @@ Value* CGT_X86::addLoop(Instruction* I, Value* limit, const Twine& indexName) {
 // types, output types and isLastInput buffer type. All the streaming
 // inputs/outputs are converted to i8*, since this is the type of buffer
 // handles.
-StructType* CGT_X86::getArgumentListStructTy(DFNode* C) {
-  std::vector<Type*> TyList;
+StructType *CGT_X86::getArgumentListStructTy(DFNode *C) {
+  std::vector<Type *> TyList; // field types in order: inputs, outputs, isLastInput
   // Input types
-  Function* CF = C->getFuncPointer();
-  for(Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
-      ai != ae; ++ai) {
-    if(C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
+  Function *CF = C->getFuncPointer(); // node function whose signature defines the fields
+  for (Function::arg_iterator ai = CF->arg_begin(), ae = CF->arg_end();
+       ai != ae; ++ai) {
+    if (C->getInDFEdgeAt(ai->getArgNo())->isStreamingEdge())
       TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
-    else 
+    else
       TyList.push_back(ai->getType());
   }
   // Output Types
-  StructType* OutStructTy = cast<StructType>(CF->getReturnType());
+  StructType *OutStructTy = cast<StructType>(CF->getReturnType()); // cast<> asserts the return type is a struct
   for (unsigned i = 0; i < OutStructTy->getNumElements(); i++) {
     // All outputs of a node are streaming edge
-    assert(C->getOutDFEdgeAt(i)->isStreamingEdge() 
-        && "All output edges of child node have to be streaming");
+    assert(C->getOutDFEdgeAt(i)->isStreamingEdge() &&
+           "All output edges of child node have to be streaming");
     TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
   }
   // isLastInput buffer element
   TyList.push_back(Type::getInt8PtrTy(CF->getContext()));
 
-  StructType* STy = StructType::create(CF->getContext(), TyList,
-                        Twine("struct.thread."+CF->getName()).str(), true);
+  StructType *STy =
+      StructType::create(CF->getContext(), TyList,
+                         Twine("struct.thread." + CF->getName()).str(), true); // trailing 'true' requests a packed struct
   return STy;
-
 }
 
-void CGT_X86::startNodeThread(DFNode* C, std::vector<Value*> Args, DenseMap<DFEdge*, Value*>
-                              EdgeBufferMap, Value* isLastInputBuffer, Value* graphID,
-                              Instruction* IB) {
-  DEBUG(errs() << "Starting Pipeline for child node: " << C->getFuncPointer()->getName() << "\n");
+void CGT_X86::startNodeThread(DFNode *C, std::vector<Value *> Args,
+                              DenseMap<DFEdge *, Value *> EdgeBufferMap,
+                              Value *isLastInputBuffer, Value *graphID,
+                              Instruction *IB) { // every instruction created here is inserted before IB
+  DEBUG(errs() << "Starting Pipeline for child node: "
+               << C->getFuncPointer()->getName() << "\n");
   // Create a filter/pipeline function for the child node
-  Function* C_Pipeline = createFunctionFilter(C);
-  Function* CF = C->getFuncPointer();
+  Function *C_Pipeline = createFunctionFilter(C); // handed to llvm_visc_createThread at the end
+  Function *CF = C->getFuncPointer(); // supplies the input count and the output struct type
 
   // Get module context and i32 0 constant, as they would be frequently used in
   // this function.
-  LLVMContext& Ctx = IB->getParent()->getContext();
-  Constant* IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+  LLVMContext &Ctx = IB->getParent()->getContext();
+  Constant *IntZero = ConstantInt::get(Type::getInt32Ty(Ctx), 0); // first GEP index for every field access
 
   // Marshall arguments
   // Create a packed struct type with inputs of C followed by outputs and then
   // another i8* to indicate isLastInput buffer. Streaming inputs are replaced
   // by i8*
   //
-  StructType* STy = getArgumentListStructTy(C);
+  StructType *STy = getArgumentListStructTy(C);
   // Allocate the struct on heap *NOT* stack and bitcast i8* to STy*
-  CallInst* CI = CallInst::Create(malloc, ArrayRef<Value*>(ConstantExpr::getSizeOf(STy)),
-                                  C->getFuncPointer()->getName()+".inputs", IB);
-  CastInst* Struct = BitCastInst::CreatePointerCast(CI, STy->getPointerTo(), CI->getName()+".i8ptr", IB);
-  //AllocaInst* AI = new AllocaInst(STy, C->getFuncPointer()->getName()+".inputs", IB);
+  CallInst *CI =
+      CallInst::Create(malloc, ArrayRef<Value *>(ConstantExpr::getSizeOf(STy)),
+                       C->getFuncPointer()->getName() + ".inputs", IB);
+  CastInst *Struct = BitCastInst::CreatePointerCast(
+      CI, STy->getPointerTo(), CI->getName() + ".i8ptr", IB);
+  // Disabled stack-allocation variant: AllocaInst* AI = new AllocaInst(STy,
+  // C->getFuncPointer()->getName()+".inputs", IB);
   // Insert elements in the struct
-  DEBUG(errs() << "Marshall inputs for child node: " << C->getFuncPointer()->getName() << "\n");
+  DEBUG(errs() << "Marshall inputs for child node: "
+               << C->getFuncPointer()->getName() << "\n");
   // Marshall Inputs
-  for(unsigned i=0; i < CF->getFunctionType()->getNumParams(); i++) {
+  for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
     // Create constant int (i)
-    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
+    Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i);
     // Get Element pointer instruction
-    Value* GEPIndices[] = { IntZero, Int_i };
-    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
-                             ArrayRef<Value*>(GEPIndices, 2),
-                             Struct->getName()+".arg_"+Twine(i),
-                             IB);
-    DFEdge* E = C->getInDFEdgeAt(i);
+    Value *GEPIndices[] = {IntZero, Int_i};
+    GetElementPtrInst *GEP = GetElementPtrInst::Create(
+        nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
+        Struct->getName() + ".arg_" + Twine(i), IB); // address of struct field i
+    DFEdge *E = C->getInDFEdgeAt(i); // incoming edge feeding input i
     if (E->getSourceDF()->isEntryNode()) {
       // This is a Bind Input Edge
-      if(E->isStreamingEdge()) {
+      if (E->isStreamingEdge()) {
         // Streaming Bind Input edge. Get buffer corresponding to it
-        assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming Bind DFEdge!");
+        assert(EdgeBufferMap.count(E) &&
+               "No mapping buffer for a Streaming Bind DFEdge!");
         new StoreInst(EdgeBufferMap[E], GEP, IB);
-      }
-      else {
+      } else {
         // Non-streaming Bind edge
         new StoreInst(Args[i], GEP, IB);
       }
-    }
-    else {
-      // This is an edge between siblings. 
+    } else {
+      // This is an edge between siblings.
       // This must be an streaming edge. As it is our assumption that all edges
       // between two nodes in a DFG are streaming.
-      assert(EdgeBufferMap.count(E) && "No mapping buffer for a Streaming DFEdge!");
+      assert(EdgeBufferMap.count(E) &&
+             "No mapping buffer for a Streaming DFEdge!");
       new StoreInst(EdgeBufferMap[E], GEP, IB);
     }
   }
   unsigned numInputs = CF->getFunctionType()->getNumParams();
   unsigned numOutputs = cast<StructType>(CF->getReturnType())->getNumElements();
   // Marshall Outputs
-  DEBUG(errs() << "Marshall outputs for child node: " << C->getFuncPointer()->getName() << "\n");
-  for(unsigned i = 0; i < numOutputs; i++ ) {
+  DEBUG(errs() << "Marshall outputs for child node: "
+               << C->getFuncPointer()->getName() << "\n");
+  for (unsigned i = 0; i < numOutputs; i++) {
     // Create constant int (i+numInputs)
-    Constant* Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i+numInputs);
+    Constant *Int_i = ConstantInt::get(Type::getInt32Ty(Ctx), i + numInputs); // output fields follow the input fields
     // Get Element pointer instruction
-    Value* GEPIndices[] = { IntZero, Int_i };
-    GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
-                             ArrayRef<Value*>(GEPIndices, 2),
-                             Struct->getName()+".out_"+Twine(i),
-                             IB);
-    DFEdge* E = C->getOutDFEdgeAt(i);
-    assert(E->isStreamingEdge() && "Output Edge must be streaming of all nodes");
-    assert(EdgeBufferMap.count(E) && "No mapping buffer for a Out Streaming DFEdge!");
+    Value *GEPIndices[] = {IntZero, Int_i};
+    GetElementPtrInst *GEP = GetElementPtrInst::Create(
+        nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
+        Struct->getName() + ".out_" + Twine(i), IB);
+    DFEdge *E = C->getOutDFEdgeAt(i);
+    assert(E->isStreamingEdge() &&
+           "Output Edge must be streaming of all nodes");
+    assert(EdgeBufferMap.count(E) &&
+           "No mapping buffer for a Out Streaming DFEdge!");
     new StoreInst(EdgeBufferMap[E], GEP, IB);
   }
   // Marshall last argument. isLastInput buffer
-  DEBUG(errs() << "Marshall isLastInput for child node: " << C->getFuncPointer()->getName() << "\n");
+  DEBUG(errs() << "Marshall isLastInput for child node: "
+               << C->getFuncPointer()->getName() << "\n");
   // Create constant int (i+numInputs)
-  Constant* Int_index = ConstantInt::get(Type::getInt32Ty(Ctx), numInputs+numOutputs);
+  Constant *Int_index =
+      ConstantInt::get(Type::getInt32Ty(Ctx), numInputs + numOutputs); // index of the field after all inputs and outputs
   // Get Element pointer instruction
-  Value* GEPIndices[] = { IntZero, Int_index };
-  GetElementPtrInst* GEP = GetElementPtrInst::Create(nullptr, Struct,
-                           ArrayRef<Value*>(GEPIndices, 2),
-                           Struct->getName()+".isLastInput", IB);
+  Value *GEPIndices[] = {IntZero, Int_index};
+  GetElementPtrInst *GEP = GetElementPtrInst::Create(
+      nullptr, Struct, ArrayRef<Value *>(GEPIndices, 2),
+      Struct->getName() + ".isLastInput", IB);
   new StoreInst(isLastInputBuffer, GEP, IB);
 
   // AllocaInst AI points to memory with all the arguments packed
   // Call runtime to create the thread with these arguments
-  DEBUG(errs() << "Start Thread for child node: " << C->getFuncPointer()->getName() << "\n");
-// DEBUG(errs() << *llvm_visc_createThread << "\n");
+  DEBUG(errs() << "Start Thread for child node: "
+               << C->getFuncPointer()->getName() << "\n");
+  // Disabled debug print: DEBUG(errs() << *llvm_visc_createThread << "\n");
   DEBUG(errs() << *graphID->getType() << "\n");
   DEBUG(errs() << *C_Pipeline->getType() << "\n");
   DEBUG(errs() << *Struct->getType() << "\n");
   // Bitcast AI to i8*
-  CastInst* BI  = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx), Struct->getName(), IB);
-  Value* CreateThreadArgs[] = {graphID, C_Pipeline, BI};
-  CallInst* CreateThread = CallInst::Create(llvm_visc_createThread,
-                                            ArrayRef<Value*>(CreateThreadArgs, 3),
-                                            "",
-                                            IB);
-
+  CastInst *BI = BitCastInst::CreatePointerCast(Struct, Type::getInt8PtrTy(Ctx),
+                                                Struct->getName(), IB);
+  Value *CreateThreadArgs[] = {graphID, C_Pipeline, BI}; // graph handle, thread function, packed argument struct
+  CallInst *CreateThread = CallInst::Create(
+      llvm_visc_createThread, ArrayRef<Value *>(CreateThreadArgs, 3), "", IB); // CreateThread itself is not used afterwards
 }
 
-Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
+Function *CGT_X86::createLaunchFunction(DFInternalNode *N) {
   DEBUG(errs() << "Generating Streaming Launch Function\n");
   // Get Function associated with Node N
-  Function* NF = N->getFuncPointer();
+  Function *NF = N->getFuncPointer();
 
-  // Map from Streaming edge to buffer 
-  DenseMap<DFEdge*, Value*> EdgeBufferMap;
+  // Map from Streaming edge to buffer
+  DenseMap<DFEdge *, Value *> EdgeBufferMap;
 
   /* Now we have all the necessary global declarations necessary to generate the
-  * Launch function, pointer to which can be passed to pthread utils to execute
-  * DFG. The Launch function has just one input: i8* data.addr
-  * This is the address of the all the input data that needs to be passed to
-  * this function. In our case it contains the input arguments of the Root
-  * function in the correct order.
-  * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
-  * (2) Extract each of inputs from data.addr
-  * (3) create Buffers for all the streaming edges
-  *     - Put buffers in the context
-  * (4) Go over each child node
-  *     - marshall its arguments together (use buffers in place of streaming
-  *       arguments)
-  *     - Start the threads
-  * (5) The return value from Root is stored in memory, pointer to which is
-  * passed to pthread_exit call.
-  */
+   * Launch function, pointer to which can be passed to pthread utils to execute
+   * DFG. The Launch function has just one input: i8* data.addr
+   * This is the address of all the input data that needs to be passed to
+   * this function. In our case it contains the input arguments of the Root
+   * function in the correct order.
+   * (1) Create an empty Launch function of type void (i8* args, i8* GraphID)
+   * (2) Extract each of inputs from data.addr
+   * (3) create Buffers for all the streaming edges
+   *     - Put buffers in the context
+   * (4) Go over each child node
+   *     - marshall its arguments together (use buffers in place of streaming
+   *       arguments)
+   *     - Start the threads
+   * (5) The return value from Root is stored in memory, pointer to which is
+   * passed to pthread_exit call.
+   */
   // (1) Create Launch Function of type void (i8* args, i8* GraphID)
-  Type* i8Ty = Type::getInt8Ty(M.getContext());
-  Type* ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
-  FunctionType* LaunchFuncTy = FunctionType::get(Type::getVoidTy(NF->getContext()),
-                                  ArrayRef<Type*>(ArgTypes, 2), false);
-  Function* LaunchFunc = Function::Create(LaunchFuncTy,
-                                       NF->getLinkage(),
-                                       NF->getName()+".LaunchFunction",
-                                       &M);
+  Type *i8Ty = Type::getInt8Ty(M.getContext());
+  Type *ArgTypes[] = {i8Ty->getPointerTo(), i8Ty->getPointerTo()};
+  FunctionType *LaunchFuncTy = FunctionType::get(
+      Type::getVoidTy(NF->getContext()), ArrayRef<Type *>(ArgTypes, 2), false);
+  Function *LaunchFunc = Function::Create(
+      LaunchFuncTy, NF->getLinkage(), NF->getName() + ".LaunchFunction", &M);
   DEBUG(errs() << "Generating Code for Streaming Launch Function\n");
   // Give a name to the argument which is used pass data to this thread
-  Argument* data = &*LaunchFunc->arg_begin();
+  Argument *data = &*LaunchFunc->arg_begin();
   // NOTE-HS: Check correctness with Maria
-  Argument* graphID = &*(LaunchFunc->arg_begin() + 1);
+  Argument *graphID = &*(LaunchFunc->arg_begin() + 1);
   data->setName("data.addr");
   graphID->setName("graphID");
   // Add a basic block to this empty function and a return null statement to it
   DEBUG(errs() << *LaunchFunc->getReturnType() << "\n");
-  BasicBlock *BB = BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
-  ReturnInst* RI = ReturnInst::Create(LaunchFunc->getContext(),
-                                      BB);
+  BasicBlock *BB =
+      BasicBlock::Create(LaunchFunc->getContext(), "entry", LaunchFunc);
+  ReturnInst *RI = ReturnInst::Create(LaunchFunc->getContext(), BB);
 
   DEBUG(errs() << "Created Empty Launch Function\n");
 
   // (2) Extract each of inputs from data.addr
-  std::vector<Type*> TyList;
+  std::vector<Type *> TyList;
   std::vector<std::string> names;
-  std::vector<Value*> Args;
+  std::vector<Value *> Args;
 
   for (Function::arg_iterator ai = NF->arg_begin(), ae = NF->arg_end();
-      ai != ae; ++ai) {
-    if(N->getChildGraph()->getEntry()->getOutDFEdgeAt(ai->getArgNo())->isStreamingEdge()) {
+       ai != ae; ++ai) {
+    if (N->getChildGraph()
+            ->getEntry()
+            ->getOutDFEdgeAt(ai->getArgNo())
+            ->isStreamingEdge()) {
       TyList.push_back(i8Ty->getPointerTo());
-      names.push_back(Twine(ai->getName()+"_buffer").str());
+      names.push_back(Twine(ai->getName() + "_buffer").str());
       continue;
     }
     TyList.push_back(ai->getType());
     names.push_back(ai->getName());
   }
   Args = extractElements(data, TyList, names, RI);
-  DEBUG(errs() <<  "Launch function for " << NF->getName() << *LaunchFunc << "\n");
+  DEBUG(errs() << "Launch function for " << NF->getName() << *LaunchFunc
+               << "\n");
   // (3) Create buffers for all the streaming edges
-  for(DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
-      de = N->getChildGraph()->dfedge_end(); di != de; ++di) {
-    DFEdge* Edge = *di;
+  for (DFGraph::dfedge_iterator di = N->getChildGraph()->dfedge_begin(),
+                                de = N->getChildGraph()->dfedge_end();
+       di != de; ++di) {
+    DFEdge *Edge = *di;
     DEBUG(errs() << *Edge->getType() << "\n");
-    Value* size = ConstantExpr::getSizeOf(Edge->getType());
-    Value* CallArgs[] = {graphID, size};
+    Value *size = ConstantExpr::getSizeOf(Edge->getType());
+    Value *CallArgs[] = {graphID, size};
     if (Edge->isStreamingEdge()) {
-      CallInst* CI;
+      CallInst *CI;
       // Create a buffer call
-      if(Edge->getSourceDF()->isEntryNode()) {
+      if (Edge->getSourceDF()->isEntryNode()) {
         // Bind Input Edge
-        Constant* Int_ArgNo = ConstantInt::get(Type::getInt32Ty(RI->getContext()),
-                                  Edge->getSourcePosition());
-        Value* BindInCallArgs[] = {graphID, size, Int_ArgNo};
-        CI = CallInst::Create(llvm_visc_createBindInBuffer, ArrayRef<Value*>(BindInCallArgs, 3),
-                              "BindIn."+Edge->getDestDF()->getFuncPointer()->getName(),
-                              RI);
-      }
-      else if(Edge->getDestDF()->isExitNode()) {
+        Constant *Int_ArgNo = ConstantInt::get(
+            Type::getInt32Ty(RI->getContext()), Edge->getSourcePosition());
+        Value *BindInCallArgs[] = {graphID, size, Int_ArgNo};
+        CI = CallInst::Create(
+            llvm_visc_createBindInBuffer, ArrayRef<Value *>(BindInCallArgs, 3),
+            "BindIn." + Edge->getDestDF()->getFuncPointer()->getName(), RI);
+      } else if (Edge->getDestDF()->isExitNode()) {
         // Bind Output Edge
-        CI = CallInst::Create(llvm_visc_createBindOutBuffer, ArrayRef<Value*>(CallArgs, 2),
-                              "BindOut."+Edge->getSourceDF()->getFuncPointer()->getName(),
-                              RI);
-      }
-      else {
+        CI = CallInst::Create(
+            llvm_visc_createBindOutBuffer, ArrayRef<Value *>(CallArgs, 2),
+            "BindOut." + Edge->getSourceDF()->getFuncPointer()->getName(), RI);
+      } else {
         // Streaming Edge
-        CI = CallInst::Create(llvm_visc_createEdgeBuffer,
-                              ArrayRef<Value*>(CallArgs, 2),
-                              Edge->getSourceDF()->getFuncPointer()->getName()+"."
-                              +Edge->getDestDF()->getFuncPointer()->getName(),
-                              RI);
+        CI = CallInst::Create(
+            llvm_visc_createEdgeBuffer, ArrayRef<Value *>(CallArgs, 2),
+            Edge->getSourceDF()->getFuncPointer()->getName() + "." +
+                Edge->getDestDF()->getFuncPointer()->getName(),
+            RI);
       }
       EdgeBufferMap[Edge] = CI;
     }
   }
   // Create buffer for isLastInput for all the child nodes
-  DFGraph* G = N->getChildGraph();
-  DenseMap<DFNode*, Value*> NodeLastInputMap;
-  for(DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce; ++ci) {
-    DFNode* child = *ci;
-    if(child->isDummyNode())
+  DFGraph *G = N->getChildGraph();
+  DenseMap<DFNode *, Value *> NodeLastInputMap;
+  for (DFGraph::children_iterator ci = G->begin(), ce = G->end(); ci != ce;
+       ++ci) {
+    DFNode *child = *ci;
+    if (child->isDummyNode())
       continue;
-    Value* size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
-    Value* CallArgs[] = {graphID, size};
-    CallInst* CI = CallInst::Create(llvm_visc_createLastInputBuffer, ArrayRef<Value*>(CallArgs, 2),
-                              "BindIn.isLastInput."+child->getFuncPointer()->getName(),
-                              RI);
+    Value *size = ConstantExpr::getSizeOf(Type::getInt64Ty(NF->getContext()));
+    Value *CallArgs[] = {graphID, size};
+    CallInst *CI = CallInst::Create(
+        llvm_visc_createLastInputBuffer, ArrayRef<Value *>(CallArgs, 2),
+        "BindIn.isLastInput." + child->getFuncPointer()->getName(), RI);
     NodeLastInputMap[child] = CI;
   }
-  DEBUG(errs() <<  "Start Each child node filter\n");
+  DEBUG(errs() << "Start Each child node filter\n");
   // (4) Marshall arguments for each child node and start the thread with its
   //     pipeline funtion
-  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
-    DFNode* C = *ci;
+  for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+                                  ce = N->getChildGraph()->end();
+       ci != ce; ++ci) {
+    DFNode *C = *ci;
     // Skip dummy node call
     if (C->isDummyNode())
       continue;
-    
+
     // Marshall all the arguments for this node into an i8*
     // Pass to the runtime to create the thread
     // Start the thread for child node C
@@ -676,22 +695,21 @@ Function* CGT_X86::createLaunchFunction(DFInternalNode* N) {
   return LaunchFunc;
 }
 
-
-Function* CGT_X86::createPushFunction(DFInternalNode* N) {
+Function *CGT_X86::createPushFunction(DFInternalNode *N) {
   DEBUG(errs() << "Generating Push function\n");
-  Function* PushFunc;
+  Function *PushFunc = nullptr; // Stub: no function is generated yet.
   return PushFunc;
 }
 
-Function* CGT_X86::createPopFunction(DFInternalNode* N) {
+Function *CGT_X86::createPopFunction(DFInternalNode *N) {
   DEBUG(errs() << "Generating Pop function\n");
-  Function* PushFunc;
+  Function *PushFunc = nullptr; // Stub: no function is generated yet.
   return PushFunc;
 }
 
-Function* CGT_X86::createWaitFunction(DFInternalNode* N) {
+Function *CGT_X86::createWaitFunction(DFInternalNode *N) {
   DEBUG(errs() << "Generating Wait function\n");
-  Function* PushFunc;
+  Function *PushFunc = nullptr; // Stub: no function is generated yet.
   return PushFunc;
 }
 /* This fuction does the steps necessary to launch a streaming graph
@@ -701,171 +719,162 @@ Function* CGT_X86::createWaitFunction(DFInternalNode* N) {
  * Modify each of the instrinsic in host code
  * Launch, Push, Pop, Wait
  */
-void CGT_X86::codeGenLaunchStreaming(DFInternalNode* Root) {
-  IntrinsicInst* LI = Root->getInstruction();
-  Function* RootLaunch = createLaunchFunction(Root);
-  //Function* RootPush = createPushFunction(Root);
-  //Function* RootPop = createPopFunction(Root);
-  //Function* RootWait = createWaitFunction(Root);
+void CGT_X86::codeGenLaunchStreaming(DFInternalNode *Root) {
+  IntrinsicInst *LI = Root->getInstruction();
+  Function *RootLaunch = createLaunchFunction(Root);
+  // Function* RootPush = createPushFunction(Root);
+  // Function* RootPop = createPopFunction(Root);
+  // Function* RootWait = createWaitFunction(Root);
   // Substitute launch intrinsic main
-  DEBUG(errs() <<  "Substitute launch intrinsic\n");
-  Value* LaunchInstArgs[] = {RootLaunch,
-                             LI->getArgOperand(1)
-                            };
-  CallInst* LaunchInst = CallInst::Create(llvm_visc_streamLaunch,
-                                          ArrayRef<Value*>(LaunchInstArgs,2),
-                                          "graph"+Root->getFuncPointer()->getName(), LI);
-  //ReplaceInstWithInst(LI, LaunchInst);
+  DEBUG(errs() << "Substitute launch intrinsic\n");
+  Value *LaunchInstArgs[] = {RootLaunch, LI->getArgOperand(1)};
+  CallInst *LaunchInst = CallInst::Create(
+      llvm_visc_streamLaunch, ArrayRef<Value *>(LaunchInstArgs, 2),
+      "graph" + Root->getFuncPointer()->getName(), LI);
+  // ReplaceInstWithInst(LI, LaunchInst); -- disabled, LI is not replaced here.
 
   DEBUG(errs() << *LaunchInst << "\n");
   // Replace all wait instructions with x86 specific wait instructions
-  DEBUG(errs() <<  "Substitute wait, push, pop intrinsics\n");
-  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
-  for(unsigned i=0; i < UseList->size(); ++i) {
-    IntrinsicInst* II = UseList->at(i);
-    CallInst* CI;
-    Value* PushArgs[] = {LaunchInst, II->getOperand(1)};
-    switch(II->getIntrinsicID()) {
+  DEBUG(errs() << "Substitute wait, push, pop intrinsics\n");
+  std::vector<IntrinsicInst *> *UseList = getUseList(LI);
+  for (unsigned i = 0; i < UseList->size(); ++i) {
+    IntrinsicInst *II = UseList->at(i);
+    CallInst *CI; // Assigned by every case below; default is unreachable.
+    Value *PushArgs[] = {LaunchInst, II->getOperand(1)}; // visc_push only
+    switch (II->getIntrinsicID()) {
     case Intrinsic::visc_wait:
-      CI = CallInst::Create(llvm_visc_streamWait,
-                            ArrayRef<Value*>(LaunchInst),
+      CI = CallInst::Create(llvm_visc_streamWait, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
     case Intrinsic::visc_push:
       CI = CallInst::Create(llvm_visc_streamPush,
-                            ArrayRef<Value*>(PushArgs, 2),
-                            "");
+                            ArrayRef<Value *>(PushArgs, 2), "");
       break;
     case Intrinsic::visc_pop:
-      CI = CallInst::Create(llvm_visc_streamPop,
-                            ArrayRef<Value*>(LaunchInst),
+      CI = CallInst::Create(llvm_visc_streamPop, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
     default:
-      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
+      llvm_unreachable(
+          "GraphID is used by an instruction other than wait, push, pop");
     };
     DEBUG(errs() << "Replace:\n\t" << *II << "\n");
     ReplaceInstWithInst(II, CI);
     DEBUG(errs() << "\twith " << *CI << "\n");
   }
-
-
 }
 
-void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
+void CGT_X86::codeGenLaunch(DFInternalNode *Root) {
   // TODO: Place an assert to check if the constant passed by launch intrinsic
   // as the number of arguments to DFG is same as the number of arguments of the
   // root of DFG
   DEBUG(errs() << "Generating Launch Function\n");
   // Get Launch Instruction
-  IntrinsicInst* LI = Root->getInstruction();
+  IntrinsicInst *LI = Root->getInstruction();
   switchToTimer(visc_TimerID_PTHREAD_CREATE, LI);
   DEBUG(errs() << "Generating Launch Function\n");
 
   /* Now we have all the necessary global declarations necessary to generate the
-  * Launch function, pointer to which can be passed to pthread utils to execute
-  * DFG. The Launch function has just one input: i8* data.addr
-  * This is the address of the all the input data that needs to be passed to
-  * this function. In our case it contains the input arguments of the Root
-  * function in the correct order.
-  * (1) Create an empty Launch function of type i8*(i8*)
-  * (2) Extract each of inputs from data.addr and pass them as arguments to the
-  * call to Root function
-  * (3) The return value from Root is stored in memory, pointer to which is
-  * passed to pthread_exit call.
-  */
+   * Launch function, pointer to which can be passed to pthread utils to execute
+   * DFG. The Launch function has just one input: i8* data.addr
+   * This is the address of the all the input data that needs to be passed to
+   * this function. In our case it contains the input arguments of the Root
+   * function in the correct order.
+   * (1) Create an empty Launch function of type i8*(i8*)
+   * (2) Extract each of inputs from data.addr and pass them as arguments to the
+   * call to Root function
+   * (3) The return value from Root is stored in memory, pointer to which is
+   * passed to pthread_exit call.
+   */
   // Create Launch Function of type i8*(i8*) which calls the root function
-  Type* i8Ty = Type::getInt8Ty(M.getContext());
-  FunctionType* AppFuncTy = FunctionType::get(i8Ty->getPointerTo(),
-                            ArrayRef<Type*>(i8Ty->getPointerTo()),
-                            false);
-  Function* AppFunc = Function::Create(AppFuncTy,
-                                       Root->getFuncPointer()->getLinkage(),
-                                       "LaunchDataflowGraph",
-                                       &M);
+  Type *i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType *AppFuncTy = FunctionType::get(
+      i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false);
+  Function *AppFunc =
+      Function::Create(AppFuncTy, Root->getFuncPointer()->getLinkage(),
+                       "LaunchDataflowGraph", &M);
   DEBUG(errs() << "Generating Launch Function\n");
   // Give a name to the argument which is used pass data to this thread
-  Value* data = &*AppFunc->arg_begin();
+  Value *data = &*AppFunc->arg_begin();
   data->setName("data.addr");
   // Add a basic block to this empty function and a return null statement to it
   BasicBlock *BB = BasicBlock::Create(AppFunc->getContext(), "entry", AppFunc);
-  ReturnInst* RI = ReturnInst::Create(AppFunc->getContext(),
-                                      Constant::getNullValue(AppFunc->getReturnType()),
-                                      BB);
+  ReturnInst *RI =
+      ReturnInst::Create(AppFunc->getContext(),
+                         Constant::getNullValue(AppFunc->getReturnType()), BB);
   switchToTimer(visc_TimerID_ARG_UNPACK, RI);
 
   DEBUG(errs() << "Created Empty Launch Function\n");
   // Find the X86 function generated for Root and
-//  Function* RootF_X86 = Root->getGenFunc();
-  Function* RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
+  //  Function* RootF_X86 = Root->getGenFunc();
+  Function *RootF_X86 = Root->getGenFuncForTarget(visc::CPU_TARGET);
   assert(RootF_X86 && "Error: No generated CPU function for Root node\n");
   assert(Root->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
          "Error: Generated Function for Root node with no x86 wrapper\n");
 
   // Generate a call to RootF_X86 with null parameters for now
-  std::vector<Value*>Args;
-  for(unsigned i=0; i< RootF_X86->getFunctionType()->getNumParams(); i++) {
-    Args.push_back(Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
+  std::vector<Value *> Args;
+  for (unsigned i = 0; i < RootF_X86->getFunctionType()->getNumParams(); i++) {
+    Args.push_back(
+        Constant::getNullValue(RootF_X86->getFunctionType()->getParamType(i)));
   }
-  CallInst* CI = CallInst::Create(RootF_X86, Args, RootF_X86->getName()+".output", RI);
+  CallInst *CI =
+      CallInst::Create(RootF_X86, Args, RootF_X86->getName() + ".output", RI);
 
   // Extract input data from i8* data.addr and patch them to correct argument of
   // call to RootF_X86. For each argument
-  std::vector<Type*> TyList;
+  std::vector<Type *> TyList;
   std::vector<std::string> names;
-  for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
-      ai != ae; ++ai) {
+  for (Function::arg_iterator ai = RootF_X86->arg_begin(),
+                              ae = RootF_X86->arg_end();
+       ai != ae; ++ai) {
     TyList.push_back(ai->getType());
     names.push_back(ai->getName());
   }
-  std::vector<Value*> elements = extractElements(data, TyList, names, CI);
+  std::vector<Value *> elements = extractElements(data, TyList, names, CI);
   // Patch the elements to the call arguments
-  for(unsigned i=0; i<CI->getNumArgOperands(); i++)
+  for (unsigned i = 0; i < CI->getNumArgOperands(); i++)
     CI->setArgOperand(i, elements[i]);
 
   // Add timers around Call to RootF_X86 function
   switchToTimer(visc_TimerID_COMPUTATION, CI);
   switchToTimer(visc_TimerID_OUTPUT_PACK, RI);
 
-  StructType *RootRetTy = cast<StructType>(RootF_X86->getFunctionType()->getReturnType());
+  StructType *RootRetTy =
+      cast<StructType>(RootF_X86->getFunctionType()->getReturnType());
 
-  // if Root has non empty return 
+  // if Root has non empty return
   if (RootRetTy->getNumElements()) {
     // We can't access the type of the arg struct - build it
-    std::vector<Type*> TyList;
-    for(Function::arg_iterator ai = RootF_X86->arg_begin(), ae = RootF_X86->arg_end();
-        ai != ae; ++ai) {
+    std::vector<Type *> TyList;
+    for (Function::arg_iterator ai = RootF_X86->arg_begin(),
+                                ae = RootF_X86->arg_end();
+         ai != ae; ++ai) {
       TyList.push_back(ai->getType());
     }
     TyList.push_back(CI->getType());
 
-    StructType* ArgStructTy = StructType::create(M.getContext(),
-                                                 ArrayRef<Type*>(TyList),
-                                 (RootF_X86->getName()+".arg.struct.ty").str(), true);
+    StructType *ArgStructTy = StructType::create(
+        M.getContext(), ArrayRef<Type *>(TyList),
+        (RootF_X86->getName() + ".arg.struct.ty").str(), true);
 
     // Cast the data pointer to the type of the arg struct
-    CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
-                                 ArgStructTy->getPointerTo(),
-                                 "argStructCast.addr",
-                                 RI);
+    CastInst *OutputAddrCast = CastInst::CreatePointerCast(
+        data, ArgStructTy->getPointerTo(), "argStructCast.addr", RI);
 
     // Result struct is the last element of the packed struct passed to launch
     unsigned outStructIdx = ArgStructTy->getNumElements() - 1;
 
-    ConstantInt *IntZero = ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
-    ConstantInt *IntIdx = ConstantInt::get(Type::getInt32Ty(M.getContext()),
-                                          outStructIdx);
+    ConstantInt *IntZero =
+        ConstantInt::get(Type::getInt32Ty(M.getContext()), 0);
+    ConstantInt *IntIdx =
+        ConstantInt::get(Type::getInt32Ty(M.getContext()), outStructIdx);
 
-    Value* GEPIIdxList[] = { IntZero,
-                             IntIdx
-                           };
+    Value *GEPIIdxList[] = {IntZero, IntIdx};
     // Get data pointer to the last element of struct - result field
-    GetElementPtrInst *OutGEPI =
-      GetElementPtrInst::Create(ArgStructTy,
-                                OutputAddrCast,
-                                ArrayRef<Value*>(GEPIIdxList, 2),
-                                CI->getName()+".addr",
-                                RI);
+    GetElementPtrInst *OutGEPI = GetElementPtrInst::Create(
+        ArgStructTy, OutputAddrCast, ArrayRef<Value *>(GEPIIdxList, 2),
+        CI->getName() + ".addr", RI);
     // Store result there
     new StoreInst(CI, OutGEPI, RI);
   } else {
@@ -874,10 +883,8 @@ void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
     // We were casting the data pointer to the result type of Root, and
     // returning result there. This would work at the LLVM level, but not
     // at the C level, thus the rewrite.
-    CastInst* OutputAddrCast = CastInst::CreatePointerCast(data,
-                               CI->getType()->getPointerTo(),
-                               CI->getName()+".addr",
-                               RI);
+    CastInst *OutputAddrCast = CastInst::CreatePointerCast(
+        data, CI->getType()->getPointerTo(), CI->getName() + ".addr", RI);
     new StoreInst(CI, OutputAddrCast, RI);
   }
 
@@ -887,104 +894,100 @@ void CGT_X86::codeGenLaunch(DFInternalNode* Root) {
   DEBUG(errs() << *AppFunc << "\n");
 
   // Substitute launch intrinsic main
-  Value* LaunchInstArgs[] = {AppFunc,
-                             LI->getArgOperand(1)
-                            };
-  CallInst* LaunchInst = CallInst::Create(llvm_visc_x86_launch,
-                                          ArrayRef<Value*>(LaunchInstArgs,2),
-                                          "graph"+Root->getFuncPointer()->getName(), LI);
-  //ReplaceInstWithInst(LI, LaunchInst);
+  Value *LaunchInstArgs[] = {AppFunc, LI->getArgOperand(1)};
+  CallInst *LaunchInst = CallInst::Create(
+      llvm_visc_x86_launch, ArrayRef<Value *>(LaunchInstArgs, 2),
+      "graph" + Root->getFuncPointer()->getName(), LI);
+  // ReplaceInstWithInst(LI, LaunchInst);
 
   DEBUG(errs() << *LaunchInst << "\n");
   // Replace all wait instructions with x86 specific wait instructions
-  std::vector<IntrinsicInst*>* UseList = getUseList(LI);
-  for(unsigned i=0; i < UseList->size(); ++i) {
-    IntrinsicInst* II = UseList->at(i);
-    CallInst* CI;
-    switch(II->getIntrinsicID()) {
+  std::vector<IntrinsicInst *> *UseList = getUseList(LI);
+  for (unsigned i = 0; i < UseList->size(); ++i) {
+    IntrinsicInst *II = UseList->at(i);
+    CallInst *CI;
+    switch (II->getIntrinsicID()) {
     case Intrinsic::visc_wait:
-      CI = CallInst::Create(llvm_visc_x86_wait,
-                            ArrayRef<Value*>(LaunchInst),
+      CI = CallInst::Create(llvm_visc_x86_wait, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
     case Intrinsic::visc_push:
-      CI = CallInst::Create(llvm_visc_bufferPush,
-                            ArrayRef<Value*>(LaunchInst),
+      CI = CallInst::Create(llvm_visc_bufferPush, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
     case Intrinsic::visc_pop:
-      CI = CallInst::Create(llvm_visc_bufferPop,
-                            ArrayRef<Value*>(LaunchInst),
+      CI = CallInst::Create(llvm_visc_bufferPop, ArrayRef<Value *>(LaunchInst),
                             "");
       break;
     default:
-      llvm_unreachable("GraphID is used by an instruction other than wait, push, pop");
+      llvm_unreachable(
+          "GraphID is used by an instruction other than wait, push, pop");
     };
     ReplaceInstWithInst(II, CI);
     DEBUG(errs() << *CI << "\n");
   }
-
 }
 
-Value* CGT_X86::getInValueAt(DFNode* Child, unsigned i, Function* ParentF_X86, Instruction* InsertBefore) {
+Value *CGT_X86::getInValueAt(DFNode *Child, unsigned i, Function *ParentF_X86,
+                             Instruction *InsertBefore) {
   // TODO: Assumption is that each input port of a node has just one
   // incoming edge. May change later on.
 
   // Find the incoming edge at the requested input port
-  DFEdge* E = Child->getInDFEdgeAt(i);
+  DFEdge *E = Child->getInDFEdgeAt(i);
   assert(E && "No incoming edge or binding for input element!");
   // Find the Source DFNode associated with the incoming edge
-  DFNode* SrcDF = E->getSourceDF();
+  DFNode *SrcDF = E->getSourceDF();
 
   // If Source DFNode is a dummyNode, edge is from parent. Get the
   // argument from argument list of this internal node
-  Value* inputVal;
-  if(SrcDF->isEntryNode()) {
+  Value *inputVal;
+  if (SrcDF->isEntryNode()) {
     inputVal = getArgumentAt(ParentF_X86, E->getSourcePosition());
-    DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-  }
-  else {
+    DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
+  } else {
     // edge is from a sibling
     // Check - code should already be generated for this source dfnode
-    assert(OutputMap.count(SrcDF)
-           && "Source node call not found. Dependency violation!");
+    assert(OutputMap.count(SrcDF) &&
+           "Source node call not found. Dependency violation!");
 
     // Find CallInst associated with the Source DFNode using OutputMap
-    Value* CI = OutputMap[SrcDF];
+    Value *CI = OutputMap[SrcDF];
 
     // Extract element at source position from this call instruction
     std::vector<unsigned> IndexList;
     IndexList.push_back(E->getSourcePosition());
-    DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-    ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                           "", InsertBefore);
+    DEBUG(errs() << "Going to generate ExtractVal inst from " << *CI << "\n");
+    ExtractValueInst *EI =
+        ExtractValueInst::Create(CI, IndexList, "", InsertBefore);
     inputVal = EI;
   }
   return inputVal;
 }
 
-void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
-                              ValueToValueMapTy &VMap,Instruction* IB) {
-  Function* CF = C->getFuncPointer();
+void CGT_X86::invokeChild_X86(DFNode *C, Function *F_X86,
+                              ValueToValueMapTy &VMap, Instruction *IB) {
+  Function *CF = C->getFuncPointer();
 
-//  Function* CF_X86 = C->getGenFunc();
+  //  Function* CF_X86 = C->getGenFunc();
   Function *CF_X86 = C->getGenFuncForTarget(visc::CPU_TARGET);
-  assert(CF_X86 != NULL
-         && "Found leaf node for which code generation has not happened yet!\n");
+  assert(CF_X86 != NULL &&
+         "Found leaf node for which code generation has not happened yet!\n");
   assert(C->hasX86GenFuncForTarget(visc::CPU_TARGET) &&
-         "The generated function to be called from x86 backend is not an x86 function\n");
+         "The generated function to be called from x86 backend is not an x86 "
+         "function\n");
   DEBUG(errs() << "Invoking child node" << CF_X86->getName() << "\n");
 
-  std::vector<Value*> Args;
+  std::vector<Value *> Args;
   // Create argument list to pass to call instruction
   // First find the correct values using the edges
   // The remaing six values are inserted as constants for now.
-  for(unsigned i=0; i<CF->getFunctionType()->getNumParams(); i++) {
+  for (unsigned i = 0; i < CF->getFunctionType()->getNumParams(); i++) {
     Args.push_back(getInValueAt(C, i, F_X86, IB));
   }
 
-  Value* I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
-  for(unsigned j=0; j<6; j++)
+  Value *I64Zero = ConstantInt::get(Type::getInt64Ty(F_X86->getContext()), 0);
+  for (unsigned j = 0; j < 6; j++)
     Args.push_back(I64Zero);
 
   DEBUG(errs() << "Gen Function type: " << *CF_X86->getType() << "\n");
@@ -992,9 +995,8 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
   DEBUG(errs() << "Arguments: " << Args.size() << "\n");
 
   // Call the F_X86 function associated with this node
-  CallInst* CI = CallInst::Create(CF_X86, Args,
-                                  CF_X86->getName()+"_output",
-                                  IB);
+  CallInst *CI =
+      CallInst::Create(CF_X86, Args, CF_X86->getName() + "_output", IB);
   DEBUG(errs() << *CI << "\n");
   OutputMap[C] = CI;
 
@@ -1002,55 +1004,56 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
   // Based on number of dimensions, insert loop instructions
   std::string varNames[3] = {"x", "y", "z"};
   unsigned numArgs = CI->getNumArgOperands();
-  for(unsigned j=0; j < C->getNumOfDim(); j++) {
-    Value* indexLimit = NULL;
+  for (unsigned j = 0; j < C->getNumOfDim(); j++) {
+    Value *indexLimit = NULL;
     // Limit can either be a constant or an arguement of the internal node.
     // In case of constant we can use that constant value directly in the
     // new F_X86 function. In case of an argument, we need to get the mapped
     // value using VMap
-    if(isa<Constant>(C->getDimLimits()[j])) {
+    if (isa<Constant>(C->getDimLimits()[j])) {
       indexLimit = C->getDimLimits()[j];
       DEBUG(errs() << "In Constant case:\n"
-             << "  indexLimit type = " << *indexLimit->getType() << "\n");
-    }
-    else {
+                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
+    } else {
       indexLimit = VMap[C->getDimLimits()[j]];
       DEBUG(errs() << "In VMap case:"
-             <<"  indexLimit type = " << *indexLimit->getType() << "\n");
+                   << "  indexLimit type = " << *indexLimit->getType() << "\n");
     }
     assert(indexLimit && "Invalid dimension limit!");
     // Insert loop
-    Value* indexVar = addLoop(CI, indexLimit, varNames[j]);
+    Value *indexVar = addLoop(CI, indexLimit, varNames[j]);
     DEBUG(errs() << "indexVar type = " << *indexVar->getType() << "\n");
     // Insert index variable and limit arguments
-    CI->setArgOperand(numArgs-6+j, indexVar);
-    CI->setArgOperand(numArgs-3+j, indexLimit);
+    CI->setArgOperand(numArgs - 6 + j, indexVar);
+    CI->setArgOperand(numArgs - 3 + j, indexLimit);
   }
   // Insert call to runtime to push the dim limits and instanceID on the depth
   // stack
-  Value* args[] = {
-    ConstantInt::get(Type::getInt32Ty(CI->getContext()), C->getNumOfDim()), // numDim
-    CI->getArgOperand(numArgs-3+0), // limitX
-    CI->getArgOperand(numArgs-6+0), // iX
-    CI->getArgOperand(numArgs-3+1), // limitY
-    CI->getArgOperand(numArgs-6+1), // iY
-    CI->getArgOperand(numArgs-3+2), // limitZ
-    CI->getArgOperand(numArgs-6+2)  // iZ
+  Value *args[] = {
+      ConstantInt::get(Type::getInt32Ty(CI->getContext()),
+                       C->getNumOfDim()), // numDim
+      CI->getArgOperand(numArgs - 3 + 0), // limitX
+      CI->getArgOperand(numArgs - 6 + 0), // iX
+      CI->getArgOperand(numArgs - 3 + 1), // limitY
+      CI->getArgOperand(numArgs - 6 + 1), // iY
+      CI->getArgOperand(numArgs - 3 + 2), // limitZ
+      CI->getArgOperand(numArgs - 6 + 2)  // iZ
   };
 
-  CallInst* Push = CallInst::Create(llvm_visc_x86_dstack_push, ArrayRef<Value*>(args, 7), "", CI);
+  CallInst *Push = CallInst::Create(llvm_visc_x86_dstack_push,
+                                    ArrayRef<Value *>(args, 7), "", CI);
   DEBUG(errs() << "Push on stack: " << *Push << "\n");
   // Insert call to runtime to pop the dim limits and instanceID from the depth
   // stack
   BasicBlock::iterator i(CI);
   ++i;
-  Instruction* NextI = &*i;
+  Instruction *NextI = &*i;
   // Next Instruction should also belong to the same basic block as the basic
   // block will have a terminator instruction
-  assert(NextI->getParent() == CI->getParent()
-         && "Next Instruction should also belong to the same basic block!");
+  assert(NextI->getParent() == CI->getParent() &&
+         "Next Instruction should also belong to the same basic block!");
 
-  CallInst* Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
+  CallInst *Pop = CallInst::Create(llvm_visc_x86_dstack_pop, None, "", NextI);
   DEBUG(errs() << "Pop from stack: " << *Pop << "\n");
   DEBUG(errs() << *CI->getParent()->getParent());
 }
@@ -1071,34 +1074,33 @@ void CGT_X86::invokeChild_X86(DFNode* C, Function* F_X86,
 // Add runtime API calls to push output for each of the streaming outputs
 // Add loop around the basic block, which exits the loop if isLastInput is false
 
-Function* CGT_X86::createFunctionFilter(DFNode* C) {
-  DEBUG(errs() << "*********Creating Function filter for " << C->getFuncPointer()->getName() << "*****\n");
+Function *CGT_X86::createFunctionFilter(DFNode *C) {
+  DEBUG(errs() << "*********Creating Function filter for "
+               << C->getFuncPointer()->getName() << "*****\n");
 
   /* Create a function with same argument list as child.*/
   DEBUG(errs() << "\tCreate a function with the same argument list as child\n");
   // Get the generated function for child node
-  Function* CF = C->getFuncPointer();
+  Function *CF = C->getFuncPointer();
   // Create Filter Function of type i8*(i8*) which calls the root function
-  Type* i8Ty = Type::getInt8Ty(M.getContext());
-  FunctionType* CF_PipelineTy = FunctionType::get(i8Ty->getPointerTo(),
-                                ArrayRef<Type*>(i8Ty->getPointerTo()),
-                                false);
-  Function* CF_Pipeline = Function::Create(CF_PipelineTy,
-                          CF->getLinkage(),
-                          CF->getName()+"_Pipeline",
-                          &M);
+  Type *i8Ty = Type::getInt8Ty(M.getContext());
+  FunctionType *CF_PipelineTy = FunctionType::get(
+      i8Ty->getPointerTo(), ArrayRef<Type *>(i8Ty->getPointerTo()), false);
+  Function *CF_Pipeline = Function::Create(CF_PipelineTy, CF->getLinkage(),
+                                           CF->getName() + "_Pipeline", &M);
   DEBUG(errs() << "Generating Pipline Function\n");
   // Give a name to the argument which is used pass data to this thread
-  Value* data = &*CF_Pipeline->arg_begin();
+  Value *data = &*CF_Pipeline->arg_begin();
   data->setName("data.addr");
   // Create a new basic block
   DEBUG(errs() << "\tCreate new BB and add a return function\n");
   // Add a basic block to this empty function
-  BasicBlock *BB = BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
+  BasicBlock *BB =
+      BasicBlock::Create(CF_Pipeline->getContext(), "entry", CF_Pipeline);
   // Add a return instruction to the basic block
-  ReturnInst* RI = ReturnInst::Create(CF_Pipeline->getContext(),
-                                      UndefValue::get(CF_Pipeline->getReturnType()), BB);
-
+  ReturnInst *RI =
+      ReturnInst::Create(CF_Pipeline->getContext(),
+                         UndefValue::get(CF_Pipeline->getReturnType()), BB);
 
   /* Extract the elements from the aggregate argument to the function.
    * Replace the streaming inputs with i8* types signifying handle to
@@ -1109,25 +1111,24 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
   DEBUG(errs() << "\tReplace streaming input arguments with i8* type\n");
   // These Args will be used when passing arguments to the generated function
   // inside loop, and reading outputs as well.
-  std::vector<Value*> Args;
-  std::vector<Type*> TyList;
+  std::vector<Value *> Args;
+  std::vector<Type *> TyList;
   std::vector<std::string> names;
   // Adding inputs
-  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
-       i != e; ++i) {
-    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e;
+       ++i) {
+    if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
       TyList.push_back(i8Ty->getPointerTo());
-      names.push_back((Twine(i->getName())+"_buffer").str());
-    }
-    else {
+      names.push_back((Twine(i->getName()) + "_buffer").str());
+    } else {
       TyList.push_back(i->getType());
       names.push_back(i->getName());
     }
   }
   // Adding outputs. FIXME: Since we assume all outputs to be streaming edges,
   // because we get there buffer handles
-  StructType* RetTy = cast<StructType>(CF->getReturnType());
-  for (unsigned i=0; i<RetTy->getNumElements(); i++) {
+  StructType *RetTy = cast<StructType>(CF->getReturnType());
+  for (unsigned i = 0; i < RetTy->getNumElements(); i++) {
     TyList.push_back(i8Ty->getPointerTo());
     names.push_back("out");
   }
@@ -1138,64 +1139,52 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
 
   // Extract the inputs, outputs and
   Args = extractElements(data, TyList, names, RI);
-  for(unsigned i=0; i<Args.size(); i++) {
+  for (unsigned i = 0; i < Args.size(); i++) {
     DEBUG(errs() << *Args[i] << "\n");
   }
 
   // Split the Args vector into, input output and isLastInput
   unsigned numInputs = CF->getFunctionType()->getNumParams();
   unsigned numOutputs = RetTy->getNumElements();
-  std::vector<Value*> InputArgs(Args.begin(), Args.begin() + numInputs);
-  std::vector<Value*> OutputArgs(Args.begin() + numInputs, Args.begin() + numInputs + numOutputs);
-  Instruction* isLastInput = cast<Instruction>(Args[Args.size()-1]);
+  std::vector<Value *> InputArgs(Args.begin(), Args.begin() + numInputs);
+  std::vector<Value *> OutputArgs(Args.begin() + numInputs,
+                                  Args.begin() + numInputs + numOutputs);
+  Instruction *isLastInput = cast<Instruction>(Args[Args.size() - 1]);
 
   /* Add runtime API calls to get input for each of the streaming input edges */
-  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the streaming input edges\n");
+  DEBUG(errs() << "\tAdd runtime API calls to get input for each of the "
+                  "streaming input edges\n");
   // First read the termination condition variable islastInput
-  CallInst* isLastInputPop = CallInst::Create(llvm_visc_bufferPop,
-                                        ArrayRef<Value*>(isLastInput),
-                                        "",
-                                        RI);
-
-  CastInst* BI = BitCastInst::CreateIntegerCast(isLastInputPop,
-                 Type::getInt64Ty(CF_Pipeline->getContext()),
-                 false,
-                 "isLastInput",
-                 RI);
+  CallInst *isLastInputPop = CallInst::Create(
+      llvm_visc_bufferPop, ArrayRef<Value *>(isLastInput), "", RI);
+
+  CastInst *BI = BitCastInst::CreateIntegerCast(
+      isLastInputPop, Type::getInt64Ty(CF_Pipeline->getContext()), false,
+      "isLastInput", RI);
   isLastInput = BI;
   // Create a loop termination condition
-  CmpInst* Cond = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE,
-      isLastInput, Constant::getNullValue(Type::getInt64Ty(CF->getContext())), "isLastInputNotZero",
-      RI);
+  CmpInst *Cond = CmpInst::Create(
+      Instruction::ICmp, CmpInst::ICMP_NE, isLastInput,
+      Constant::getNullValue(Type::getInt64Ty(CF->getContext())),
+      "isLastInputNotZero", RI);
 
   // Get input from buffers of all the incoming streaming edges
-  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end();
-       i != e; ++i) {
-    if(C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
-      CallInst* bufferIn = CallInst::Create(llvm_visc_bufferPop,
-                                            ArrayRef<Value*>(InputArgs[i->getArgNo()]),
-                                            "",
-                                            RI);
-      CastInst* BI;
-      if(i->getType()->isPointerTy()) {
-        BI = CastInst::Create(CastInst::IntToPtr,
-                              bufferIn,
-                              i->getType(),
-                              i->getName()+".addr",
-                              RI);
-      }
-      else if(i->getType()->isFloatTy()) {
-        BI = CastInst::CreateFPCast(bufferIn,
-                                    i->getType(),
-                                    i->getName()+".addr",
-                                    RI);
-      }
-      else {
-        BI = CastInst::CreateIntegerCast(bufferIn,
-                                         i->getType(),
-                                         false,
-                                         i->getName()+".addr",
-                                         RI);
+  for (Function::arg_iterator i = CF->arg_begin(), e = CF->arg_end(); i != e;
+       ++i) {
+    if (C->getInDFEdgeAt(i->getArgNo())->isStreamingEdge()) {
+      CallInst *bufferIn =
+          CallInst::Create(llvm_visc_bufferPop,
+                           ArrayRef<Value *>(InputArgs[i->getArgNo()]), "", RI);
+      CastInst *BI;
+      if (i->getType()->isPointerTy()) {
+        BI = CastInst::Create(CastInst::IntToPtr, bufferIn, i->getType(),
+                              i->getName() + ".addr", RI);
+      } else if (i->getType()->isFloatTy()) {
+        BI = CastInst::CreateFPCast(bufferIn, i->getType(),
+                                    i->getName() + ".addr", RI);
+      } else {
+        BI = CastInst::CreateIntegerCast(bufferIn, i->getType(), false,
+                                         i->getName() + ".addr", RI);
       }
       // Replace the argument in Args vector. We would be using the vector as
       // parameters passed to the call
@@ -1204,46 +1193,40 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
   }
   /* Add a call to the generated function of the child node */
   DEBUG(errs() << "\tAdd a call to the generated function of the child node\n");
-//  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
-//  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
-//                                  C->getGenFunc()->getName()+".output", RI);
+  //  DEBUG(errs() << "Type: " << *C->getGenFunc()->getType() << "\n");
+  //  CallInst* CI = CallInst::Create(C->getGenFunc(), InputArgs,
+  //                                  C->getGenFunc()->getName()+".output", RI);
   Function *CGenF = C->getGenFuncForTarget(visc::CPU_TARGET);
-  DEBUG(errs() << "Type: "
-               << *CGenF->getType()
-               << "\n");
-  CallInst* CI = CallInst::Create(CGenF,
-                                  InputArgs,
-                                  CGenF->getName()+".output",
-                                  RI);
+  DEBUG(errs() << "Type: " << *CGenF->getType() << "\n");
+  CallInst *CI =
+      CallInst::Create(CGenF, InputArgs, CGenF->getName() + ".output", RI);
 
   /* Add runtime API calls to push output for each of the streaming outputs */
   // FIXME: Assumption
   // All edges between siblings are streaming edges
-  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the streaming outputs\n");
-  for (unsigned i=0; i< numOutputs; i++) {
+  DEBUG(errs() << "\tAdd runtime API calls to push output for each of the "
+                  "streaming outputs\n");
+  for (unsigned i = 0; i < numOutputs; i++) {
     // Extract output
-    ExtractValueInst* EI = ExtractValueInst::Create(CI, ArrayRef<unsigned>(i),
-                           "",RI);
+    ExtractValueInst *EI =
+        ExtractValueInst::Create(CI, ArrayRef<unsigned>(i), "", RI);
     // Convert to i64
-    CastInst* BI;
-    if(EI->getType()->isPointerTy())
-      BI = CastInst::Create(CastInst::PtrToInt,EI,
-                            Type::getInt64Ty(CF_Pipeline->getContext()),
-                            "",
-                            RI);
+    CastInst *BI;
+    if (EI->getType()->isPointerTy())
+      BI =
+          CastInst::Create(CastInst::PtrToInt, EI,
+                           Type::getInt64Ty(CF_Pipeline->getContext()), "", RI);
     else
-      BI = CastInst::CreateIntegerCast(EI, Type::getInt64Ty(CF_Pipeline->getContext()),
-                                       false, "", RI);
+      BI = CastInst::CreateIntegerCast(
+          EI, Type::getInt64Ty(CF_Pipeline->getContext()), false, "", RI);
     // Push to Output buffer
-    Value* bufferOutArgs[] = {OutputArgs[i], BI};
-    CallInst* bufferOut = CallInst::Create(llvm_visc_bufferPush,
-                                           ArrayRef<Value*>(bufferOutArgs, 2),
-                                           "",
-                                           RI);
+    Value *bufferOutArgs[] = {OutputArgs[i], BI};
+    CallInst *bufferOut = CallInst::Create(
+        llvm_visc_bufferPush, ArrayRef<Value *>(bufferOutArgs, 2), "", RI);
   }
 
-  // Add loop around the basic block, which exits the loop if isLastInput is false
-  // Pointers to keep the created loop structure
+  // Add loop around the basic block, which exits the loop if isLastInput is
+  // false. Pointers to keep the created loop structure:
   BasicBlock *EntryBB, *CondBB, *BodyBB;
   Instruction *CondStartI = cast<Instruction>(isLastInputPop);
   Instruction *BodyStartI = cast<Instruction>(Cond)->getNextNode();
@@ -1258,16 +1241,16 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
   // If the node function calls the visc runtime call to get policy, we update
   // it with the counter information. This means we need to pass an additional
   // argument to the generated function, that is the iteration number, and then
-  // use it as an argument to the policy_getVersion call 
+  // use it as an argument to the policy_getVersion call
   if (GetPolicyCI) {
     CntI = addWhileLoopCounter(EntryBB, CondBB, BodyBB);
     assert(CntI && "Counter instruction not found\n");
 
     // Create new function type (with additional argument for iteration number)
     Type *NewRetTy = CGenF->getFunctionType()->getReturnType();
-    std::vector<Type*> NewArgTypes;
+    std::vector<Type *> NewArgTypes;
     for (Function::arg_iterator ai = CGenF->arg_begin(), ae = CGenF->arg_end();
-         ai != ae ; ++ai) {
+         ai != ae; ++ai) {
       NewArgTypes.push_back(ai->getType());
     }
     NewArgTypes.push_back(Type::getInt64Ty(M.getContext()));
@@ -1283,9 +1266,8 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
 
     // Add counter to the actual parameter list, to create the new call
     InputArgs.push_back(CntI);
-    CallInst* newCI = CallInst::Create(NewCGenF,
-                                       InputArgs,
-                                       NewCGenF->getName()+".output");
+    CallInst *newCI =
+        CallInst::Create(NewCGenF, InputArgs, NewCGenF->getName() + ".output");
     ReplaceInstWithInst(CI, newCI);
 
     // Set second operand of the policy_getVersion call to the last function
@@ -1300,19 +1282,19 @@ Function* CGT_X86::createFunctionFilter(DFNode* C) {
   return CF_Pipeline;
 }
 
-void CGT_X86::codeGen(DFInternalNode* N) {
+void CGT_X86::codeGen(DFInternalNode *N) {
   // Check if N is root node and its graph is streaming. We do not do codeGen
   // for Root in such a case
-  if(N->isRoot() && N->isChildGraphStreaming())
+  if (N->isRoot() && N->isChildGraphStreaming())
     return;
 
   // Check if clone already exists. If it does, it means we have visited this
   // function before and nothing else needs to be done for this leaf node.
-//  if(N->getGenFunc() != NULL)
-//    return;
+  //  if(N->getGenFunc() != NULL)
+  //    return;
   if (!preferredTargetIncludes(N, visc::CPU_TARGET)) {
-    DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName() <<
-                    " : skipping it\n");
+    DEBUG(errs() << "No CPU hint for node " << N->getFuncPointer()->getName()
+                 << " : skipping it\n");
     return;
   }
 
@@ -1325,9 +1307,10 @@ void CGT_X86::codeGen(DFInternalNode* N) {
   // Only process if all children have a CPU x86 function
   // Otherwise skip to end
   bool codeGen = true;
-  for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-      ce = N->getChildGraph()->end(); ci != ce; ++ci) {
-    DFNode* C = *ci;
+  for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+                                  ce = N->getChildGraph()->end();
+       ci != ce; ++ci) {
+    DFNode *C = *ci;
     // Skip dummy node call
     if (C->isDummyNode())
       continue;
@@ -1342,17 +1325,18 @@ void CGT_X86::codeGen(DFInternalNode* N) {
   }
 
   if (codeGen) {
-    Function* F = N->getFuncPointer();
+    Function *F = N->getFuncPointer();
     // Create of clone of F with no instructions. Only the type is the same as F
     // without the extra arguments.
-    Function* F_X86;
-  
+    Function *F_X86;
+
     // Clone the function, if we are seeing this function for the first time. We
     // only need a clone in terms of type.
     ValueToValueMapTy VMap;
-  
+
     // Create new function with the same type
-    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(), F->getName(), &M);
+    F_X86 = Function::Create(F->getFunctionType(), F->getLinkage(),
+                             F->getName(), &M);
 
     // Loop over the arguments, copying the names of arguments over.
     Function::arg_iterator dest_iterator = F_X86->arg_begin();
@@ -1365,19 +1349,19 @@ void CGT_X86::codeGen(DFInternalNode* N) {
 
     // Add a basic block to this empty function
     BasicBlock *BB = BasicBlock::Create(F_X86->getContext(), "entry", F_X86);
-    ReturnInst* RI = ReturnInst::Create(F_X86->getContext(),
-                                        UndefValue::get(F_X86->getReturnType()), BB);
+    ReturnInst *RI = ReturnInst::Create(
+        F_X86->getContext(), UndefValue::get(F_X86->getReturnType()), BB);
 
-    // Add Index and Dim arguments except for the root node and the child graph of
-    // parent node is not streaming
-    if(!N->isRoot() && !N->getParent()->isChildGraphStreaming())
+    // Add Index and Dim arguments, unless this is the root node or the child
+    // graph of the parent node is streaming
+    if (!N->isRoot() && !N->getParent()->isChildGraphStreaming())
       F_X86 = addIdxDimArgs(F_X86);
 
     BB = &*F_X86->begin();
     RI = cast<ReturnInst>(BB->getTerminator());
-  
-    //Add generated function info to DFNode
-//    N->setGenFunc(F_X86, visc::CPU_TARGET);
+
+    // Add generated function info to DFNode
+    //    N->setGenFunc(F_X86, visc::CPU_TARGET);
     N->addGenFunc(F_X86, visc::CPU_TARGET, true);
 
     // Loop over the arguments, to create the VMap.
@@ -1390,59 +1374,59 @@ void CGT_X86::codeGen(DFInternalNode* N) {
     }
 
     // Iterate over children in topological order
-    for(DFGraph::children_iterator ci = N->getChildGraph()->begin(),
-        ce = N->getChildGraph()->end(); ci != ce; ++ci) {
-      DFNode* C = *ci;
+    for (DFGraph::children_iterator ci = N->getChildGraph()->begin(),
+                                    ce = N->getChildGraph()->end();
+         ci != ce; ++ci) {
+      DFNode *C = *ci;
       // Skip dummy node call
       if (C->isDummyNode())
         continue;
-  
+
       // Create calls to CPU function of child node
       invokeChild_X86(C, F_X86, VMap, RI);
-  
     }
- 
+
     DEBUG(errs() << "*** Generating epilogue code for the function****\n");
     // Generate code for output bindings
     // Get Exit node
-    DFNode* C = N->getChildGraph()->getExit();
+    DFNode *C = N->getChildGraph()->getExit();
     // Get OutputType of this node
-    StructType* OutTy = N->getOutputType();
+    StructType *OutTy = N->getOutputType();
     Value *retVal = UndefValue::get(F_X86->getReturnType());
     // Find all the input edges to exit node
-    for (unsigned i=0; i < OutTy->getNumElements(); i++) {
+    for (unsigned i = 0; i < OutTy->getNumElements(); i++) {
       DEBUG(errs() << "Output Edge " << i << "\n");
       // Find the incoming edge at the requested input port
-      DFEdge* E = C->getInDFEdgeAt(i);
-  
+      DFEdge *E = C->getInDFEdgeAt(i);
+
       assert(E && "No Binding for output element!");
       // Find the Source DFNode associated with the incoming edge
-      DFNode* SrcDF = E->getSourceDF();
-  
-      DEBUG(errs() << "Edge source -- " <<  SrcDF->getFuncPointer()->getName() << "\n");
-  
+      DFNode *SrcDF = E->getSourceDF();
+
+      DEBUG(errs() << "Edge source -- " << SrcDF->getFuncPointer()->getName()
+                   << "\n");
+
       // If Source DFNode is a dummyNode, edge is from parent. Get the
       // argument from argument list of this internal node
-      Value* inputVal;
-      if(SrcDF->isEntryNode()) {
+      Value *inputVal;
+      if (SrcDF->isEntryNode()) {
         inputVal = getArgumentAt(F_X86, i);
-        DEBUG(errs() << "Argument "<< i<< " = "  << *inputVal << "\n");
-      }
-      else {
+        DEBUG(errs() << "Argument " << i << " = " << *inputVal << "\n");
+      } else {
         // edge is from a internal node
         // Check - code should already be generated for this source dfnode
-        assert(OutputMap.count(SrcDF)
-               && "Source node call not found. Dependency violation!");
-  
+        assert(OutputMap.count(SrcDF) &&
+               "Source node call not found. Dependency violation!");
+
         // Find Output Value associated with the Source DFNode using OutputMap
-        Value* CI = OutputMap[SrcDF];
-  
+        Value *CI = OutputMap[SrcDF];
+
         // Extract element at source position from this call instruction
         std::vector<unsigned> IndexList;
         IndexList.push_back(E->getSourcePosition());
-        DEBUG(errs() << "Going to generate ExtarctVal inst from "<< *CI <<"\n");
-        ExtractValueInst* EI = ExtractValueInst::Create(CI, IndexList,
-                               "",RI);
+        DEBUG(errs() << "Going to generate ExtarctVal inst from " << *CI
+                     << "\n");
+        ExtractValueInst *EI = ExtractValueInst::Create(CI, IndexList, "", RI);
         inputVal = EI;
       }
       std::vector<unsigned> IdxList;
@@ -1451,9 +1435,8 @@ void CGT_X86::codeGen(DFInternalNode* N) {
     }
     DEBUG(errs() << "Extracted all\n");
     retVal->setName("output");
-    ReturnInst* newRI = ReturnInst::Create(F_X86->getContext(), retVal);
+    ReturnInst *newRI = ReturnInst::Create(F_X86->getContext(), retVal);
     ReplaceInstWithInst(RI, newRI);
-
   }
 
   //-------------------------------------------------------------------------//
@@ -1470,11 +1453,11 @@ void CGT_X86::codeGen(DFInternalNode* N) {
   bool CFx86 = N->hasX86GenFuncForTarget(visc::CPU_TARGET);
   bool GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
 
-  DEBUG(errs() << "Node: " << N->getFuncPointer()->getName()
-                           << " with tag " << N->getTag() << "\n");
-  DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n");
+  DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
+               << N->getTag() << "\n");
+  DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n");
   DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n");
-  DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n");
+  DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n");
   DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n");
 
   if (N->getTag() == visc::None) {
@@ -1482,7 +1465,7 @@ void CGT_X86::codeGen(DFInternalNode* N) {
     // node is a node that
     // - from the accelerator backends has been mapped to an intermediate
     // node, and thus they have not produced a genFunc
-    // - a child node had no CPU hint, thus no code gen for CPU could 
+    // - a child node had no CPU hint, thus no code gen for CPU could
     // take place
     DEBUG(errs() << "No GenFunc - Skipping CPU code generation for node "
                  << N->getFuncPointer()->getName() << "\n");
@@ -1493,34 +1476,34 @@ void CGT_X86::codeGen(DFInternalNode* N) {
 
     // Sanity check - to be removed TODO
     switch (N->getTag()) {
-      case visc::CPU_TARGET:
-        assert(N->getGenFuncForTarget(visc::CPU_TARGET) && "");
-        assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
-        assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
-        break;
-      case visc::GPU_TARGET:
-        assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
-        assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
-        assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
-        assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
-        break;
-      default:
-        assert(false && "Unreachable: we checked that tag was single target!\n");
-        break;
+    case visc::CPU_TARGET:
+      assert(N->getGenFuncForTarget(visc::CPU_TARGET) && "");
+      assert(N->hasX86GenFuncForTarget(visc::CPU_TARGET) && "");
+      assert(!(N->getGenFuncForTarget(visc::GPU_TARGET)) && "");
+      assert(!(N->hasX86GenFuncForTarget(visc::GPU_TARGET)) && "");
+      break;
+    case visc::GPU_TARGET:
+      assert(!(N->getGenFuncForTarget(visc::CPU_TARGET)) && "");
+      assert(!(N->hasX86GenFuncForTarget(visc::CPU_TARGET)) && "");
+      assert(N->getGenFuncForTarget(visc::GPU_TARGET) && "");
+      assert(N->hasX86GenFuncForTarget(visc::GPU_TARGET) && "");
+      break;
+    default:
+      assert(false && "Unreachable: we checked that tag was single target!\n");
+      break;
     }
 
-    // If device abstraction is enabled, then we may need to edit the node 
+    // If device abstraction is enabled, then we may need to edit the node
     // function. In case this is a GPU or SPIR gen func, we issue a call to
     // the runtime that waits for the device to be available
     if (DeviceAbstraction) {
       Function *NodeGenFunc = NULL;
       switch (N->getTag()) {
-        case visc::GPU_TARGET:
-          NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET);
-          break;
-        default:
-          break;
+      case visc::GPU_TARGET:
+        NodeGenFunc = N->getGenFuncForTarget(visc::GPU_TARGET);
+        break;
+      default:
+        break;
       }
 
       if (NodeGenFunc) {
@@ -1528,12 +1511,14 @@ void CGT_X86::codeGen(DFInternalNode* N) {
         // its first statement
         BasicBlock *BB = &*NodeGenFunc->begin();
         std::vector<Value *> Args; // TODO: add the device type as argument?
-        FunctionCallee RTF = 
-           M.getOrInsertFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus", 
-           runtimeModule->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")->getFunctionType());
-	CallInst *RTFInst = CallInst::Create(RTF, Args, "", BB->getFirstNonPHI());
+        FunctionCallee RTF = M.getOrInsertFunction(
+            "llvm_visc_deviceAbstraction_waitOnDeviceStatus",
+            runtimeModule
+                ->getFunction("llvm_visc_deviceAbstraction_waitOnDeviceStatus")
+                ->getFunctionType());
+        CallInst *RTFInst =
+            CallInst::Create(RTF, Args, "", BB->getFirstNonPHI());
       }
-
     }
 
     Function *Ftmp = N->getGenFuncForTarget(N->getTag());
@@ -1550,11 +1535,11 @@ void CGT_X86::codeGen(DFInternalNode* N) {
     GFx86 = N->hasX86GenFuncForTarget(visc::GPU_TARGET);
 
     DEBUG(errs() << "After editing\n");
-    DEBUG(errs() << "Node: " << N->getFuncPointer()->getName()
-                             << " with tag " << N->getTag() << "\n");
-    DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null" ) << "\n");
+    DEBUG(errs() << "Node: " << N->getFuncPointer()->getName() << " with tag "
+                 << N->getTag() << "\n");
+    DEBUG(errs() << "CPU Fun: " << (CF ? CF->getName() : "null") << "\n");
     DEBUG(errs() << "hasx86GenFuncForCPU : " << CFx86 << "\n");
-    DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null" ) << "\n");
+    DEBUG(errs() << "GPU Fun: " << (GF ? GF->getName() : "null") << "\n");
     DEBUG(errs() << "hasx86GenFuncForGPU : " << GFx86 << "\n");
 <<<<<<< HEAD
 =======
@@ -2015,4 +2000,3 @@ static RegisterPass<DFG2LLVM_X86> X("dfg2llvm-x86",
                                     "Dataflow Graph to LLVM for X86 backend",
                                     false /* does not modify the CFG */,
                                     true /* transformation, not just analysis */);
-
diff --git a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp
index 5e0d4df006cb89414d15991e99f29332f6329b99..799de784c8c4a675927fbb5a0e63ea30b668d738 100644
--- a/hpvm/lib/Transforms/GenVISC/GenVISC.cpp
+++ b/hpvm/lib/Transforms/GenVISC/GenVISC.cpp
@@ -10,112 +10,118 @@
 #define DEBUG_TYPE "genvisc"
 #include "GenVISC/GenVISC.h"
 
+#include "SupportVISC/VISCHint.h"
+#include "SupportVISC/VISCUtils.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/IR/CallSite.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IRReader/IRReader.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "SupportVISC/VISCHint.h"
-#include "SupportVISC/VISCUtils.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/IR/Instructions.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "SupportVISC/VISCUtils.h"
-
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
-#define TIMER(X) do { if (VISCTimer) { X; } } while (0)
+#define TIMER(X)                                                               \
+  do {                                                                         \
+    if (VISCTimer) {                                                           \
+      X;                                                                       \
+    }                                                                          \
+  } while (0)
 
 using namespace llvm;
 using namespace viscUtils;
 
-
 // VISC Command line option to use timer or not
-static cl::opt<bool>
-VISCTimer("visc-timers-gen", cl::desc("Enable GenVISC timer"));
+static cl::opt<bool> VISCTimer("visc-timers-gen",
+                               cl::desc("Enable GenVISC timer"));
 
 namespace genvisc {
 
 // Helper Functions
 
-static inline ConstantInt* getTimerID(Module&, enum visc_TimerID);
-static Function* transformReturnTypeToStruct(Function* F);
-static Type* getReturnTypeFromReturnInst(Function* F);
+static inline ConstantInt *getTimerID(Module &, enum visc_TimerID);
+static Function *transformReturnTypeToStruct(Function *F);
+static Type *getReturnTypeFromReturnInst(Function *F);
 
 // Check if the dummy function call is a __visc__node call
-#define IS_VISC_CALL(callName) \
-  static bool isVISCCall_##callName(Instruction* I) { \
-    if(!isa<CallInst>(I)) \
-      return false; \
-    CallInst* CI = cast<CallInst>(I); \
-    return (CI->getCalledValue()->stripPointerCasts()->getName()).equals("__visc__"#callName); \
+#define IS_VISC_CALL(callName)                                                 \
+  static bool isVISCCall_##callName(Instruction *I) {                          \
+    if (!isa<CallInst>(I))                                                     \
+      return false;                                                            \
+    CallInst *CI = cast<CallInst>(I);                                          \
+    return (CI->getCalledValue()->stripPointerCasts()->getName())              \
+        .equals("__visc__" #callName);                                         \
   }
 
-static void ReplaceCallWithIntrinsic(Instruction* I, Intrinsic::ID IntrinsicID, std::vector<Instruction*>* Erase) {
+static void ReplaceCallWithIntrinsic(Instruction *I, Intrinsic::ID IntrinsicID,
+                                     std::vector<Instruction *> *Erase) {
   // Check if the instruction is Call Instruction
   assert(isa<CallInst>(I) && "Expecting CallInst");
-  CallInst* CI = cast<CallInst>(I);
+  CallInst *CI = cast<CallInst>(I);
   DEBUG(errs() << "Found call: " << *CI << "\n");
 
   // Find the correct intrinsic call
-  Module* M = CI->getParent()->getParent()->getParent();
-  Function* F;
-  std::vector<Type*> ArgTypes;
-  std::vector<Value*> args;
-  if(Intrinsic::isOverloaded(IntrinsicID)) {
+  Module *M = CI->getParent()->getParent()->getParent();
+  Function *F;
+  std::vector<Type *> ArgTypes;
+  std::vector<Value *> args;
+  if (Intrinsic::isOverloaded(IntrinsicID)) {
     // This is an overloaded intrinsic. The types must exactly match. Get the
     // argument types
-    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
+    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
       ArgTypes.push_back(CI->getArgOperand(i)->getType());
       args.push_back(CI->getArgOperand(i));
     }
     F = Intrinsic::getDeclaration(M, IntrinsicID, ArgTypes);
     DEBUG(errs() << *F << "\n");
-  }
-  else { // Non-overloaded intrinsic
+  } else { // Non-overloaded intrinsic
     F = Intrinsic::getDeclaration(M, IntrinsicID);
-    FunctionType* FTy = F->getFunctionType();
+    FunctionType *FTy = F->getFunctionType();
     DEBUG(errs() << *F << "\n");
 
     // Create argument list
-    assert(CI->getNumArgOperands() == FTy->getNumParams()
-        && "Number of arguments of call do not match with Intrinsic");
-    for(unsigned i=0; i < CI->getNumArgOperands(); i++) {
-      Value* V = CI->getArgOperand(i);
+    assert(CI->getNumArgOperands() == FTy->getNumParams() &&
+           "Number of arguments of call do not match with Intrinsic");
+    for (unsigned i = 0; i < CI->getNumArgOperands(); i++) {
+      Value *V = CI->getArgOperand(i);
       // Either the type should match or both should be of pointer type
       assert((V->getType() == FTy->getParamType(i) ||
-          (V->getType()->isPointerTy() && FTy->getParamType(i)->isPointerTy()))
-          && "Dummy function call argument does not match with Intrinsic argument!");
+              (V->getType()->isPointerTy() &&
+               FTy->getParamType(i)->isPointerTy())) &&
+             "Dummy function call argument does not match with Intrinsic "
+             "argument!");
       // If the types do not match, then both must be pointer type and pointer
       // cast needs to be performed
-      if(V->getType() != FTy->getParamType(i)) {
+      if (V->getType() != FTy->getParamType(i)) {
         V = CastInst::CreatePointerCast(V, FTy->getParamType(i), "", CI);
       }
       args.push_back(V);
     }
   }
   // Insert call instruction
-  CallInst* Inst = CallInst::Create(F, args, F->getReturnType()->isVoidTy()? "" : CI->getName(), CI);
+  CallInst *Inst = CallInst::Create(
+      F, args, F->getReturnType()->isVoidTy() ? "" : CI->getName(), CI);
 
   DEBUG(errs() << "\tSubstitute with: " << *Inst << "\n");
 
   CI->replaceAllUsesWith(Inst);
   // If the previous instruction needs to be erased, insert it in the vector
   // Erased
-  if(Erase != NULL)
+  if (Erase != NULL)
     Erase->push_back(CI);
 }
 
 IS_VISC_CALL(launch) /* Exists but not required */
-IS_VISC_CALL(edge) /* Exists but not required */
+IS_VISC_CALL(edge)   /* Exists but not required */
 IS_VISC_CALL(createNodeND)
-//IS_VISC_CALL(createNode)
-//IS_VISC_CALL(createNode1D)
-//IS_VISC_CALL(createNode2D)
-//IS_VISC_CALL(createNode3D)
+// IS_VISC_CALL(createNode)
+// IS_VISC_CALL(createNode1D)
+// IS_VISC_CALL(createNode2D)
+// IS_VISC_CALL(createNode3D)
 IS_VISC_CALL(bindIn)
 IS_VISC_CALL(bindOut)
 IS_VISC_CALL(push)
@@ -124,7 +130,7 @@ IS_VISC_CALL(getNode)
 IS_VISC_CALL(getParentNode)
 IS_VISC_CALL(barrier)
 IS_VISC_CALL(malloc)
-IS_VISC_CALL(return)
+IS_VISC_CALL(return )
 IS_VISC_CALL(getNodeInstanceID_x)
 IS_VISC_CALL(getNodeInstanceID_y)
 IS_VISC_CALL(getNodeInstanceID_z)
@@ -152,7 +158,6 @@ IS_VISC_CALL(sqrt)
 IS_VISC_CALL(sin)
 IS_VISC_CALL(cos)
 
-
 IS_VISC_CALL(init)
 IS_VISC_CALL(cleanup)
 IS_VISC_CALL(wait)
@@ -163,94 +168,91 @@ IS_VISC_CALL(attributes)
 IS_VISC_CALL(hint)
 
 // Return the constant integer represented by value V
-static unsigned getNumericValue(Value* V) {
-  assert(isa<ConstantInt>(V)
-         && "Value indicating the number of arguments should be a constant integer");
+static unsigned getNumericValue(Value *V) {
+  assert(
+      isa<ConstantInt>(V) &&
+      "Value indicating the number of arguments should be a constant integer");
   return cast<ConstantInt>(V)->getZExtValue();
 }
 
 // Take the __visc__return instruction and generate code for combining the
 // values being returned into a struct and returning it.
 // The first operand is the number of returned values
-static Value* genCodeForReturn(CallInst* CI) {
-  LLVMContext& Ctx = CI->getContext();
-  assert(isVISCCall_return(CI)
-      && "__visc__return instruction expected!");
+static Value *genCodeForReturn(CallInst *CI) {
+  LLVMContext &Ctx = CI->getContext();
+  assert(isVISCCall_return(CI) && "__visc__return instruction expected!");
 
   // Parse the dummy function call here
-  assert(CI->getNumArgOperands() > 0 && "Too few arguments for __visc_return call!\n");
+  assert(CI->getNumArgOperands() > 0 &&
+         "Too few arguments for __visc_return call!\n");
   unsigned numRetVals = getNumericValue(CI->getArgOperand(0));
 
-  assert(CI->getNumArgOperands()-1 == numRetVals &&
+  assert(CI->getNumArgOperands() - 1 == numRetVals &&
          "Too few arguments for __visc_return call!\n");
   DEBUG(errs() << "\tNum of return values = " << numRetVals << "\n");
 
-  std::vector<Type*> ArgTypes;
-  for(unsigned i=1; i < CI->getNumArgOperands(); i++) {
+  std::vector<Type *> ArgTypes;
+  for (unsigned i = 1; i < CI->getNumArgOperands(); i++) {
     ArgTypes.push_back(CI->getArgOperand(i)->getType());
   }
   Twine outTyName = "struct.out." + CI->getParent()->getParent()->getName();
-  StructType* RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true);
+  StructType *RetTy = StructType::create(Ctx, ArgTypes, outTyName.str(), true);
 
-  InsertValueInst* IV = InsertValueInst::Create(UndefValue::get(RetTy),
-                                                CI->getArgOperand(1),
-                                                0,
-                                                "returnStruct",
-                                                CI);
+  InsertValueInst *IV = InsertValueInst::Create(
+      UndefValue::get(RetTy), CI->getArgOperand(1), 0, "returnStruct", CI);
   DEBUG(errs() << "Code generation for return:\n");
   DEBUG(errs() << *IV << "\n");
 
-  for(unsigned i=2; i < CI->getNumArgOperands(); i++) {
-    IV = InsertValueInst::Create(IV,
-                                 CI->getArgOperand(i),
-                                 i-1,
-                                 IV->getName(),
+  for (unsigned i = 2; i < CI->getNumArgOperands(); i++) {
+    IV = InsertValueInst::Create(IV, CI->getArgOperand(i), i - 1, IV->getName(),
                                  CI);
     DEBUG(errs() << *IV << "\n");
   }
-  
+
   return IV;
 }
 
 // Analyse the attribute call for this function. Add the in and out
 // attributes to pointer parameters.
-static void handleVISCAttributes(Function* F, CallInst* CI) {
-  DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n" << *F << "\n");
+static void handleVISCAttributes(Function *F, CallInst *CI) {
+  DEBUG(errs() << "Kernel before adding In/Out VISC attributes:\n"
+               << *F << "\n");
   // Parse the dummy function call here
   unsigned offset = 0;
   // Find number of In pointers
-  assert(CI->getNumArgOperands() > offset
-         && "Too few arguments for __visc__attributes call!");
+  assert(CI->getNumArgOperands() > offset &&
+         "Too few arguments for __visc__attributes call!");
   unsigned numInPtrs = getNumericValue(CI->getArgOperand(offset));
   DEBUG(errs() << "\tNum of in pointers = " << numInPtrs << "\n");
 
-  for(unsigned i = offset+1; i< offset+1+numInPtrs; i++) {
-    Value* V = CI->getArgOperand(i);
-    if(Argument* arg = dyn_cast<Argument>(V)) {
-      F->addAttribute(1+arg->getArgNo(), Attribute::In);
-    }
-    else {
+  for (unsigned i = offset + 1; i < offset + 1 + numInPtrs; i++) {
+    Value *V = CI->getArgOperand(i);
+    if (Argument *arg = dyn_cast<Argument>(V)) {
+      F->addAttribute(1 + arg->getArgNo(), Attribute::In);
+    } else {
       DEBUG(errs() << "Invalid argument to __visc__attribute: " << *V << "\n");
-      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
+      llvm_unreachable(
+          "Only pointer arguments can be passed to __visc__attributes call");
     }
   }
   // Find number of Out Pointers
   offset += 1 + numInPtrs;
-  assert(CI->getNumArgOperands() > offset
-         && "Too few arguments for __visc__attributes call!");
+  assert(CI->getNumArgOperands() > offset &&
+         "Too few arguments for __visc__attributes call!");
   unsigned numOutPtrs = getNumericValue(CI->getOperand(offset));
   DEBUG(errs() << "\tNum of out Pointers = " << numOutPtrs << "\n");
-  for(unsigned i = offset+1; i< offset+1+numOutPtrs; i++) {
-    Value* V = CI->getArgOperand(i);
-    if(Argument* arg = dyn_cast<Argument>(V)) {
-      F->addAttribute(1+arg->getArgNo(), Attribute::Out);
-    }
-    else {
+  for (unsigned i = offset + 1; i < offset + 1 + numOutPtrs; i++) {
+    Value *V = CI->getArgOperand(i);
+    if (Argument *arg = dyn_cast<Argument>(V)) {
+      F->addAttribute(1 + arg->getArgNo(), Attribute::Out);
+    } else {
       DEBUG(errs() << "Invalid argument to __visc__attribute: " << *V << "\n");
-      llvm_unreachable("Only pointer arguments can be passed to __visc__attributes call");
+      llvm_unreachable(
+          "Only pointer arguments can be passed to __visc__attributes call");
     }
   }
-  DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n" << *F << "\n");
+  DEBUG(errs() << "Kernel after adding In/Out VISC attributes:\n"
+               << *F << "\n");
 }
 
 // Public Functions of GenVISC pass
@@ -261,38 +263,42 @@ bool GenVISC::runOnModule(Module &M) {
   // Load Runtime API Module
   SMDiagnostic Err;
 
-  char* LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
-  assert(LLVM_SRC_ROOT != NULL &&
-         "Define LLVM_SRC_ROOT environment variable!");
+  char *LLVM_SRC_ROOT = getenv("LLVM_SRC_ROOT");
+  assert(LLVM_SRC_ROOT != NULL && "Define LLVM_SRC_ROOT environment variable!");
 
   Twine llvmSrcRoot = LLVM_SRC_ROOT;
   Twine runtimeAPI = llvmSrcRoot + "/tools/hpvm/projects/visc-rt/visc-rt.ll";
   DEBUG(errs() << llvmSrcRoot << "\n");
 
-  std::unique_ptr<Module> runtimeModule = parseIRFile(runtimeAPI.str(), Err, M.getContext());
+  std::unique_ptr<Module> runtimeModule =
+      parseIRFile(runtimeAPI.str(), Err, M.getContext());
 
-  if(runtimeModule == NULL)
+  if (runtimeModule == NULL)
     DEBUG(errs() << Err.getMessage());
   else
     DEBUG(errs() << "Successfully loaded visc-rt API module\n");
 
-  llvm_visc_initializeTimerSet = M.getOrInsertFunction("llvm_visc_initializeTimerSet",
-                                 runtimeModule->getFunction("llvm_visc_initializeTimerSet")->getFunctionType());
-  //DEBUG(errs() << *llvm_visc_initializeTimerSet);
+  llvm_visc_initializeTimerSet = M.getOrInsertFunction(
+      "llvm_visc_initializeTimerSet",
+      runtimeModule->getFunction("llvm_visc_initializeTimerSet")
+          ->getFunctionType());
+  // DEBUG(errs() << *llvm_visc_initializeTimerSet);
 
-  llvm_visc_switchToTimer = M.getOrInsertFunction("llvm_visc_switchToTimer",
-                            runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType());
- // DEBUG(errs() << *llvm_visc_switchToTimer);
+  llvm_visc_switchToTimer = M.getOrInsertFunction(
+      "llvm_visc_switchToTimer",
+      runtimeModule->getFunction("llvm_visc_switchToTimer")->getFunctionType());
+  // DEBUG(errs() << *llvm_visc_switchToTimer);
 
-  llvm_visc_printTimerSet = M.getOrInsertFunction("llvm_visc_printTimerSet",
-                            runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType());
-  //DEBUG(errs() << *llvm_visc_printTimerSet);
+  llvm_visc_printTimerSet = M.getOrInsertFunction(
+      "llvm_visc_printTimerSet",
+      runtimeModule->getFunction("llvm_visc_printTimerSet")->getFunctionType());
+  // DEBUG(errs() << *llvm_visc_printTimerSet);
 
   // Insert init context in main
   DEBUG(errs() << "Locate __visc__init()\n");
-  Function* VI = M.getFunction("__visc__init");
+  Function *VI = M.getFunction("__visc__init");
   assert(VI->getNumUses() == 1 && "__visc__init should only be used once");
-  Instruction* I = cast<Instruction>(*VI->user_begin());
+  Instruction *I = cast<Instruction>(*VI->user_begin());
 
   DEBUG(errs() << "Initialize Timer Set\n");
   initializeTimerSet(I);
@@ -300,18 +306,17 @@ bool GenVISC::runOnModule(Module &M) {
 
   // Insert print instruction at visc exit
   DEBUG(errs() << "Locate __visc__cleanup()\n");
-  Function* VC = M.getFunction("__visc__cleanup");
+  Function *VC = M.getFunction("__visc__cleanup");
   assert(VC->getNumUses() == 1 && "__visc__cleanup should only be used once");
   I = cast<Instruction>(*VC->user_begin());
   printTimerSet(I);
 
-
   DEBUG(errs() << "-------- Searching for launch sites ----------\n");
 
-  std::vector<Instruction*> toBeErased;
-  std::vector<Function*> functions;
+  std::vector<Instruction *> toBeErased;
+  std::vector<Function *> functions;
 
-  for (auto &F : M) 
+  for (auto &F : M)
     functions.push_back(&F);
 
   // Iterate over all functions in the module
@@ -319,7 +324,7 @@ bool GenVISC::runOnModule(Module &M) {
     DEBUG(errs() << "Function: " << f->getName() << "\n");
 
     // List with the required additions in the function's return type
-    std::vector<Type*> FRetTypes;
+    std::vector<Type *> FRetTypes;
 
     enum mutateTypeCause {
       mtc_None,
@@ -330,98 +335,106 @@ bool GenVISC::runOnModule(Module &M) {
     bind = mutateTypeCause::mtc_None;
 
     // Iterate over all the instructions in this function
-    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e ; ++i) {
-      Instruction* I = &*i; // Grab pointer to Instruction
+    for (inst_iterator i = inst_begin(f), e = inst_end(f); i != e; ++i) {
+      Instruction *I = &*i; // Grab pointer to Instruction
       // If not a call instruction, move to next instruction
-      if(!isa<CallInst>(I))
+      if (!isa<CallInst>(I))
         continue;
 
-      CallInst* CI = cast<CallInst>(I);
-      LLVMContext& Ctx = CI->getContext();
+      CallInst *CI = cast<CallInst>(I);
+      LLVMContext &Ctx = CI->getContext();
 
-      if(isVISCCall_init(I)) {
+      if (isVISCCall_init(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_init, &toBeErased);
       }
-      if(isVISCCall_cleanup(I)) {
+      if (isVISCCall_cleanup(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_cleanup, &toBeErased);
       }
-      if(isVISCCall_wait(I)) {
+      if (isVISCCall_wait(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_wait, &toBeErased);
       }
-      if(isVISCCall_trackMemory(I)) {
+      if (isVISCCall_trackMemory(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_trackMemory, &toBeErased);
       }
-      if(isVISCCall_untrackMemory(I)) {
+      if (isVISCCall_untrackMemory(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_untrackMemory, &toBeErased);
       }
-      if(isVISCCall_requestMemory(I)) {
+      if (isVISCCall_requestMemory(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_requestMemory, &toBeErased);
       }
-      if(isVISCCall_hint(I)) {
-        assert(isa<ConstantInt>(CI->getArgOperand(0))
-               && "Argument to hint must be constant integer!");
-        ConstantInt* hint = cast<ConstantInt>(CI->getArgOperand(0));
+      if (isVISCCall_hint(I)) {
+        assert(isa<ConstantInt>(CI->getArgOperand(0)) &&
+               "Argument to hint must be constant integer!");
+        ConstantInt *hint = cast<ConstantInt>(CI->getArgOperand(0));
 
-        visc::Target t = (visc::Target) hint->getZExtValue();
+        visc::Target t = (visc::Target)hint->getZExtValue();
         addHint(CI->getParent()->getParent(), t);
         DEBUG(errs() << "Found visc hint call: " << *CI << "\n");
         toBeErased.push_back(CI);
       }
-      if(isVISCCall_launch(I)) {
-        Function* LaunchF = Intrinsic::getDeclaration(&M, Intrinsic::visc_launch);
+      if (isVISCCall_launch(I)) {
+        Function *LaunchF =
+            Intrinsic::getDeclaration(&M, Intrinsic::visc_launch);
         DEBUG(errs() << *LaunchF << "\n");
         // Get i8* cast to function pointer
-        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
+        Function *graphFunc = cast<Function>(CI->getArgOperand(1));
         graphFunc = transformReturnTypeToStruct(graphFunc);
-        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
-	assert(F && "Function invoked by VISC launch has to be define and constant.");
-
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(0));
-	assert(Op && "VISC launch's streaming argument is a constant value.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        
+        Constant *F =
+            ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+        assert(
+            F &&
+            "Function invoked by VISC launch has to be define and constant.");
+
+        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(0));
+        assert(Op && "VISC launch's streaming argument is a constant value.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+
         auto *ArgTy = dyn_cast<PointerType>(CI->getArgOperand(2)->getType());
         assert(ArgTy && "VISC launch argument should be pointer type.");
         Value *Arg = CI->getArgOperand(2);
-        if(!ArgTy->getElementType()->isIntegerTy(8))
-          Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2), Type::getInt8PtrTy(Ctx), "", CI);
-        Value* LaunchArgs[] = {F, Arg, isStreaming};
-        CallInst* LaunchInst = CallInst::Create(LaunchF,
-                                                ArrayRef<Value*>(LaunchArgs, 3),
-                                                "graphID", CI);
+        if (!ArgTy->getElementType()->isIntegerTy(8))
+          Arg = BitCastInst::CreatePointerCast(CI->getArgOperand(2),
+                                               Type::getInt8PtrTy(Ctx), "", CI);
+        Value *LaunchArgs[] = {F, Arg, isStreaming};
+        CallInst *LaunchInst = CallInst::Create(
+            LaunchF, ArrayRef<Value *>(LaunchArgs, 3), "graphID", CI);
         DEBUG(errs() << "Found visc launch call: " << *CI << "\n");
         DEBUG(errs() << "\tSubstitute with: " << *LaunchInst << "\n");
         CI->replaceAllUsesWith(LaunchInst);
         toBeErased.push_back(CI);
       }
-      if(isVISCCall_push(I)) {
+      if (isVISCCall_push(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_push, &toBeErased);
       }
-      if(isVISCCall_pop(I)) {
+      if (isVISCCall_pop(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_pop, &toBeErased);
       }
-      if(isVISCCall_createNodeND(I)) {
+      if (isVISCCall_createNodeND(I)) {
         assert(CI->getNumArgOperands() > 0 &&
                "Too few arguments for __visc__createNodeND call");
         unsigned numDims = getNumericValue(CI->getArgOperand(0));
         // We need as meny dimension argments are there are dimensions
-        assert(CI->getNumArgOperands()-2 == numDims &&
-              "Too few arguments for __visc_createNodeND call!\n");
+        assert(CI->getNumArgOperands() - 2 == numDims &&
+               "Too few arguments for __visc_createNodeND call!\n");
 
-        Function* CreateNodeF;
+        Function *CreateNodeF;
         switch (numDims) {
         case 0:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode);
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode);
           break;
         case 1:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D);
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode1D);
           break;
         case 2:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D);
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode2D);
           break;
         case 3:
-          CreateNodeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D);
+          CreateNodeF =
+              Intrinsic::getDeclaration(&M, Intrinsic::visc_createNode3D);
           break;
         default:
           llvm_unreachable("Unsupported number of dimensions\n");
@@ -429,63 +442,57 @@ bool GenVISC::runOnModule(Module &M) {
         }
         DEBUG(errs() << *CreateNodeF << "\n");
         DEBUG(errs() << *I << "\n");
-        DEBUG(errs() << "in " << I->getParent()->getParent()->getName() << "\n");
+        DEBUG(errs() << "in " << I->getParent()->getParent()->getName()
+                     << "\n");
 
         // Get i8* cast to function pointer
-        Function* graphFunc = cast<Function>(CI->getArgOperand(1));
+        Function *graphFunc = cast<Function>(CI->getArgOperand(1));
         graphFunc = transformReturnTypeToStruct(graphFunc);
-        Constant* F = ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
+        Constant *F =
+            ConstantExpr::getPointerCast(graphFunc, Type::getInt8PtrTy(Ctx));
 
-        CallInst* CreateNodeInst;
+        CallInst *CreateNodeInst;
         switch (numDims) {
         case 0:
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(F),
-                                            graphFunc->getName()+".node", CI);
+          CreateNodeInst = CallInst::Create(CreateNodeF, ArrayRef<Value *>(F),
+                                            graphFunc->getName() + ".node", CI);
           break;
-        case 1:
-          {
+        case 1: {
           assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 2, expected to be i64\n");
-          Value* CreateNodeArgs[] = {F, CI->getArgOperand(2)};
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(CreateNodeArgs, 2),
-                                            graphFunc->getName()+".node", CI);
-          }
-          break;
-        case 2:
-          {
+          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2)};
+          CreateNodeInst = CallInst::Create(
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 2),
+              graphFunc->getName() + ".node", CI);
+        } break;
+        case 2: {
           assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 2, expected to be i64\n");
           assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 3, expected to be i64\n");
-          Value* CreateNodeArgs[] = {F,
-                                     CI->getArgOperand(2),
+          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2),
                                      CI->getArgOperand(3)};
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(CreateNodeArgs, 3),
-                                            graphFunc->getName()+".node", CI);
-          }
-          break;
-        case 3:
-          {
+          CreateNodeInst = CallInst::Create(
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 3),
+              graphFunc->getName() + ".node", CI);
+        } break;
+        case 3: {
           assert((CI->getArgOperand(2)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 2, expected to be i64\n");
           assert((CI->getArgOperand(3)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 3, expected to be i64\n");
           assert((CI->getArgOperand(4)->getType() == Type::getInt64Ty(Ctx)) &&
                  "CreateNodeND dimension argument, 4, expected to be i64\n");
-          Value* CreateNodeArgs[] = {F,
-                                     CI->getArgOperand(2),
+          Value *CreateNodeArgs[] = {F, CI->getArgOperand(2),
                                      CI->getArgOperand(3),
                                      CI->getArgOperand(4)};
-          CreateNodeInst = CallInst::Create(CreateNodeF,
-                                            ArrayRef<Value*>(CreateNodeArgs, 4),
-                                            graphFunc->getName()+".node", CI);
-          }
-          break;
+          CreateNodeInst = CallInst::Create(
+              CreateNodeF, ArrayRef<Value *>(CreateNodeArgs, 4),
+              graphFunc->getName() + ".node", CI);
+        } break;
         default:
-          llvm_unreachable("Impossible path: number of dimensions is 0, 1, 2, 3\n");
+          llvm_unreachable(
+              "Impossible path: number of dimensions is 0, 1, 2, 3\n");
           break;
         }
 
@@ -495,99 +502,104 @@ bool GenVISC::runOnModule(Module &M) {
         toBeErased.push_back(CI);
       }
 
-      if(isVISCCall_edge(I)) {
-        Function* EdgeF = Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge);
+      if (isVISCCall_edge(I)) {
+        Function *EdgeF =
+            Intrinsic::getDeclaration(&M, Intrinsic::visc_createEdge);
         DEBUG(errs() << *EdgeF << "\n");
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(5));
-        ConstantInt* EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2));
-	assert(Op && EdgeTypeOp && "Arguments of CreateEdge are not constant integers.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        Value* isAllToAll = EdgeTypeOp->isZero()? ConstantInt::getFalse(Ctx)
-                                                : ConstantInt::getTrue(Ctx);
-        Value* EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                             isAllToAll, CI->getArgOperand(3), CI->getArgOperand(4),
-                             isStreaming
-                            };
-        CallInst* EdgeInst = CallInst::Create(EdgeF,
-                                              ArrayRef<Value*>(EdgeArgs, 6),
-                                              "output", CI);
+        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(5));
+        ConstantInt *EdgeTypeOp = cast<ConstantInt>(CI->getArgOperand(2));
+        assert(Op && EdgeTypeOp &&
+               "Arguments of CreateEdge are not constant integers.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value *isAllToAll = EdgeTypeOp->isZero() ? ConstantInt::getFalse(Ctx)
+                                                 : ConstantInt::getTrue(Ctx);
+        Value *EdgeArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                             isAllToAll,           CI->getArgOperand(3),
+                             CI->getArgOperand(4), isStreaming};
+        CallInst *EdgeInst = CallInst::Create(
+            EdgeF, ArrayRef<Value *>(EdgeArgs, 6), "output", CI);
         DEBUG(errs() << "Found visc edge call: " << *CI << "\n");
         DEBUG(errs() << "\tSubstitute with: " << *EdgeInst << "\n");
         CI->replaceAllUsesWith(EdgeInst);
         toBeErased.push_back(CI);
       }
-      if(isVISCCall_bindIn(I)) {
-        Function* BindInF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input);
+      if (isVISCCall_bindIn(I)) {
+        Function *BindInF =
+            Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_input);
         DEBUG(errs() << *BindInF << "\n");
         // Check if this is a streaming bind or not
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3));
-	assert(Op && "Streaming argument for bind in intrinsic should be a constant integer.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        Value* BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                               CI->getArgOperand(2), isStreaming
-                              };
-        CallInst* BindInInst = CallInst::Create(BindInF,
-                                                ArrayRef<Value*>(BindInArgs, 4),
-                                                "", CI);
+        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3));
+        assert(Op && "Streaming argument for bind in intrinsic should be a "
+                     "constant integer.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value *BindInArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                               CI->getArgOperand(2), isStreaming};
+        CallInst *BindInInst =
+            CallInst::Create(BindInF, ArrayRef<Value *>(BindInArgs, 4), "", CI);
         DEBUG(errs() << "Found visc bindIn call: " << *CI << "\n");
         DEBUG(errs() << "\tSubstitute with: " << *BindInInst << "\n");
         CI->replaceAllUsesWith(BindInInst);
         toBeErased.push_back(CI);
       }
-      if(isVISCCall_bindOut(I)) {
-        Function* BindOutF = Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output);
+      if (isVISCCall_bindOut(I)) {
+        Function *BindOutF =
+            Intrinsic::getDeclaration(&M, Intrinsic::visc_bind_output);
         DEBUG(errs() << *BindOutF << "\n");
         // Check if this is a streaming bind or not
-        ConstantInt* Op = cast<ConstantInt>(CI->getArgOperand(3));
-	assert(Op && "Streaming argument for bind out intrinsic should be a constant integer.");
-        Value* isStreaming = Op->isZero()? ConstantInt::getFalse(Ctx)
-                             : ConstantInt::getTrue(Ctx);
-        Value* BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
-                                CI->getArgOperand(2), isStreaming
-                               };
-        CallInst* BindOutInst = CallInst::Create(BindOutF,
-                                ArrayRef<Value*>(BindOutArgs, 4),
-                                "", CI);
+        ConstantInt *Op = cast<ConstantInt>(CI->getArgOperand(3));
+        assert(Op && "Streaming argument for bind out intrinsic should be a "
+                     "constant integer.");
+        Value *isStreaming = Op->isZero() ? ConstantInt::getFalse(Ctx)
+                                          : ConstantInt::getTrue(Ctx);
+        Value *BindOutArgs[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                                CI->getArgOperand(2), isStreaming};
+        CallInst *BindOutInst = CallInst::Create(
+            BindOutF, ArrayRef<Value *>(BindOutArgs, 4), "", CI);
         DEBUG(errs() << "Found visc bindOut call: " << *CI << "\n");
         DEBUG(errs() << "\tSubstitute with: " << *BindOutInst << "\n");
 
         DEBUG(errs() << "Fixing the return type of the function\n");
         // FIXME: What if the child node function has not been visited already.
         // i.e., it's return type has not been fixed.
-        Function* F = I->getParent()->getParent();
+        Function *F = I->getParent()->getParent();
         DEBUG(errs() << F->getName() << "\n";);
-        IntrinsicInst* NodeIntrinsic = cast<IntrinsicInst>(CI->getArgOperand(0));
-	assert(NodeIntrinsic && "Instruction value in bind out is not a create node intrinsic.");
+        IntrinsicInst *NodeIntrinsic =
+            cast<IntrinsicInst>(CI->getArgOperand(0));
+        assert(NodeIntrinsic &&
+               "Instruction value in bind out is not a create node intrinsic.");
         DEBUG(errs() << "Node intrinsic: " << *NodeIntrinsic << "\n");
-	assert((NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode ||
-		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D ||
-		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D ||
-		NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) &&
-		"Instruction value in bind out is not a create node intrinsic.");
-        Function* ChildF = cast<Function>(NodeIntrinsic->getArgOperand(0)->stripPointerCasts());
+        assert(
+            (NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode ||
+             NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode1D ||
+             NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode2D ||
+             NodeIntrinsic->getIntrinsicID() == Intrinsic::visc_createNode3D) &&
+            "Instruction value in bind out is not a create node intrinsic.");
+        Function *ChildF = cast<Function>(
+            NodeIntrinsic->getArgOperand(0)->stripPointerCasts());
         DEBUG(errs() << ChildF->getName() << "\n";);
         int srcpos = cast<ConstantInt>(CI->getArgOperand(1))->getSExtValue();
         int destpos = cast<ConstantInt>(CI->getArgOperand(2))->getSExtValue();
-        StructType* ChildReturnTy = cast<StructType>(ChildF->getReturnType());
+        StructType *ChildReturnTy = cast<StructType>(ChildF->getReturnType());
 
-        Type* ReturnType = F->getReturnType();
+        Type *ReturnType = F->getReturnType();
         DEBUG(errs() << *ReturnType << "\n";);
-        assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType))
-            && "Return type should either be a struct or void type!");
+        assert((ReturnType->isVoidTy() || isa<StructType>(ReturnType)) &&
+               "Return type should either be a struct or void type!");
 
-        FRetTypes.insert(FRetTypes.begin()+destpos, ChildReturnTy->getElementType(srcpos));
+        FRetTypes.insert(FRetTypes.begin() + destpos,
+                         ChildReturnTy->getElementType(srcpos));
         assert(((bind == mutateTypeCause::mtc_BIND) ||
                 (bind == mutateTypeCause::mtc_None)) &&
-                "Both bind_out and visc_return detected");
+               "Both bind_out and visc_return detected");
         bind = mutateTypeCause::mtc_BIND;
 
         CI->replaceAllUsesWith(BindOutInst);
         toBeErased.push_back(CI);
       }
-      if(isVISCCall_attributes(I)) {
-        Function* F = CI->getParent()->getParent();
+      if (isVISCCall_attributes(I)) {
+        Function *F = CI->getParent()->getParent();
         handleVISCAttributes(F, CI);
         toBeErased.push_back(CI);
       }
@@ -604,67 +616,76 @@ bool GenVISC::runOnModule(Module &M) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_malloc, &toBeErased);
       }
       if (isVISCCall_return(I)) {
-        DEBUG(errs() << "Function before visc return processing\n" << *I->getParent()->getParent() << "\n");
+        DEBUG(errs() << "Function before visc return processing\n"
+                     << *I->getParent()->getParent() << "\n");
         // The operands to this call are the values to be returned by the node
-        Value* ReturnVal = genCodeForReturn(CI);
+        Value *ReturnVal = genCodeForReturn(CI);
         DEBUG(errs() << *ReturnVal << "\n");
-        Type* ReturnType = ReturnVal->getType();
-        assert(isa<StructType>(ReturnType)
-               && "Return type should be a struct type!");
+        Type *ReturnType = ReturnVal->getType();
+        assert(isa<StructType>(ReturnType) &&
+               "Return type should be a struct type!");
 
         assert(((bind == mutateTypeCause::mtc_RETURN) ||
                 (bind == mutateTypeCause::mtc_None)) &&
-                "Both bind_out and visc_return detected");
+               "Both bind_out and visc_return detected");
 
         if (bind == mutateTypeCause::mtc_None) {
           // If this is None, this is the first __visc__return
           // instruction we have come upon. Place the return type of the
           // function in the return type vector
           bind = mutateTypeCause::mtc_RETURN;
-          StructType* ReturnStructTy = cast<StructType>(ReturnType);
+          StructType *ReturnStructTy = cast<StructType>(ReturnType);
           for (unsigned i = 0; i < ReturnStructTy->getNumElements(); i++)
             FRetTypes.push_back(ReturnStructTy->getElementType(i));
         } else { // bind == mutateTypeCause::mtc_RETURN
           // This is not the first __visc__return
-          // instruction we have come upon. 
+          // instruction we have come upon.
           // Check that the return types are the same
-          assert((ReturnType == FRetTypes[0])
-                 && "Multiple returns with mismatching types");
+          assert((ReturnType == FRetTypes[0]) &&
+                 "Multiple returns with mismatching types");
         }
 
-        ReturnInst* RetInst = ReturnInst::Create(Ctx, ReturnVal);
+        ReturnInst *RetInst = ReturnInst::Create(Ctx, ReturnVal);
         DEBUG(errs() << "Found visc return call: " << *CI << "\n");
-        Instruction* oldReturn = CI->getParent()->getTerminator();
-        assert(isa<ReturnInst>(oldReturn)
-                && "Expecting a return to be the terminator of this BB!");
+        Instruction *oldReturn = CI->getParent()->getTerminator();
+        assert(isa<ReturnInst>(oldReturn) &&
+               "Expecting a return to be the terminator of this BB!");
         DEBUG(errs() << "Found return statement of BB: " << *oldReturn << "\n");
         DEBUG(errs() << "\tSubstitute return with: " << *RetInst << "\n");
-        //CI->replaceAllUsesWith(RetInst);
+        // CI->replaceAllUsesWith(RetInst);
         toBeErased.push_back(CI);
         ReplaceInstWithInst(oldReturn, RetInst);
-        DEBUG(errs() << "Function after visc return processing\n" << *I->getParent()->getParent() << "\n");
+        DEBUG(errs() << "Function after visc return processing\n"
+                     << *I->getParent()->getParent() << "\n");
       }
 
       if (isVISCCall_getNodeInstanceID_x(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x, &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_x,
+                                 &toBeErased);
       }
       if (isVISCCall_getNodeInstanceID_y(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y, &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_y,
+                                 &toBeErased);
       }
       if (isVISCCall_getNodeInstanceID_z(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z, &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNodeInstanceID_z,
+                                 &toBeErased);
       }
       if (isVISCCall_getNumNodeInstances_x(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x, &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_x,
+                                 &toBeErased);
       }
       if (isVISCCall_getNumNodeInstances_y(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y, &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_y,
+                                 &toBeErased);
       }
       if (isVISCCall_getNumNodeInstances_z(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z, &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_getNumNodeInstances_z,
+                                 &toBeErased);
       }
       if (isVISCCall_atomic_cmpxchg(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_cmpxchg, &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_cmpxchg,
+                                 &toBeErased);
       }
       if (isVISCCall_atomic_add(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::visc_atomic_add, &toBeErased);
@@ -706,7 +727,8 @@ bool GenVISC::runOnModule(Module &M) {
         ReplaceCallWithIntrinsic(I, Intrinsic::floor, &toBeErased);
       }
       if (isVISCCall_rsqrt(I)) {
-        ReplaceCallWithIntrinsic(I, Intrinsic::nvvm_rsqrt_approx_f, &toBeErased);
+        ReplaceCallWithIntrinsic(I, Intrinsic::nvvm_rsqrt_approx_f,
+                                 &toBeErased);
       }
       if (isVISCCall_sqrt(I)) {
         ReplaceCallWithIntrinsic(I, Intrinsic::sqrt, &toBeErased);
@@ -721,148 +743,155 @@ bool GenVISC::runOnModule(Module &M) {
 
     // Erase the __visc__node calls
     DEBUG(errs() << "Erase " << toBeErased.size() << " Statements:\n");
-    for(auto I: toBeErased) {
+    for (auto I : toBeErased) {
       DEBUG(errs() << *I << "\n");
     }
-    while(!toBeErased.empty()) {
-      Instruction* I = toBeErased.back(); 
+    while (!toBeErased.empty()) {
+      Instruction *I = toBeErased.back();
       DEBUG(errs() << "\tErasing " << *I << "\n");
       I->eraseFromParent();
-      toBeErased.pop_back(); 
+      toBeErased.pop_back();
     }
 
-    if(bind == mutateTypeCause::mtc_BIND || bind == mutateTypeCause::mtc_RETURN) {
-        DEBUG(errs() << "Function before fixing return type\n" << *f << "\n");
-        // Argument type list.
-        std::vector<Type*> FArgTypes;
-        for(Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end();
-            ai != ae; ++ai) {
-          FArgTypes.push_back(ai->getType());
-        }
-
-        // Find new return type of function
-        Type* NewReturnTy;
-        if(bind == mutateTypeCause::mtc_BIND) {
-
-          std::vector<Type*> TyList;
-          for (unsigned i = 0; i < FRetTypes.size(); i++)
-            TyList.push_back(FRetTypes[i]);
-
-          NewReturnTy = StructType::create(f->getContext(), TyList, Twine("struct.out."+f->getName()).str(), true);
-        }
-        else {
-          NewReturnTy = getReturnTypeFromReturnInst(f);
-          assert(NewReturnTy->isStructTy() && "Expecting a struct type!");
-        }
-
-        FunctionType* FTy = FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg());
-
-        // Change the function type
-        Function* newF = cloneFunction(f, FTy, false);
-        DEBUG(errs() << *newF << "\n");
-
-        if (bind == mutateTypeCause::mtc_BIND) {
-          // This is certainly an internal node, and hence just one BB with one
-          // return terminator instruction. Change return statement
-          ReturnInst* RI = cast<ReturnInst>(newF->getEntryBlock().getTerminator());
-          ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(NewReturnTy));
-          ReplaceInstWithInst(RI, newRI);        
-        }
-        if (bind == mutateTypeCause::mtc_RETURN) {
-          // Nothing
-        }
-        replaceNodeFunctionInIR(*f->getParent(), f, newF);
-        DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n");
+    if (bind == mutateTypeCause::mtc_BIND ||
+        bind == mutateTypeCause::mtc_RETURN) {
+      DEBUG(errs() << "Function before fixing return type\n" << *f << "\n");
+      // Argument type list.
+      std::vector<Type *> FArgTypes;
+      for (Function::const_arg_iterator ai = f->arg_begin(), ae = f->arg_end();
+           ai != ae; ++ai) {
+        FArgTypes.push_back(ai->getType());
+      }
+
+      // Find new return type of function
+      Type *NewReturnTy;
+      if (bind == mutateTypeCause::mtc_BIND) {
+
+        std::vector<Type *> TyList;
+        for (unsigned i = 0; i < FRetTypes.size(); i++)
+          TyList.push_back(FRetTypes[i]);
+
+        NewReturnTy =
+            StructType::create(f->getContext(), TyList,
+                               Twine("struct.out." + f->getName()).str(), true);
+      } else {
+        NewReturnTy = getReturnTypeFromReturnInst(f);
+        assert(NewReturnTy->isStructTy() && "Expecting a struct type!");
+      }
+
+      FunctionType *FTy =
+          FunctionType::get(NewReturnTy, FArgTypes, f->isVarArg());
+
+      // Change the function type
+      Function *newF = cloneFunction(f, FTy, false);
+      DEBUG(errs() << *newF << "\n");
+
+      if (bind == mutateTypeCause::mtc_BIND) {
+        // This is certainly an internal node, and hence just one BB with one
+        // return terminator instruction. Change return statement
+        ReturnInst *RI =
+            cast<ReturnInst>(newF->getEntryBlock().getTerminator());
+        ReturnInst *newRI = ReturnInst::Create(newF->getContext(),
+                                               UndefValue::get(NewReturnTy));
+        ReplaceInstWithInst(RI, newRI);
+      }
+      if (bind == mutateTypeCause::mtc_RETURN) {
+        // Nothing
+      }
+      replaceNodeFunctionInIR(*f->getParent(), f, newF);
+      DEBUG(errs() << "Function after fixing return type\n" << *newF << "\n");
     }
-
-
   }
-  return false; //TODO: What does returning "false" mean?
+  return false; // TODO: What does returning "false" mean?
 }
 
 // Generate Code for declaring a constant string [L x i8] and return a pointer
 // to the start of it.
-Value* GenVISC::getStringPointer(const Twine& S, Instruction* IB, const Twine& Name) {
-  Constant* SConstant = ConstantDataArray::getString(M->getContext(), S.str(), true);
-  Value* SGlobal = new GlobalVariable(*M, SConstant->getType(), true,
-                                      GlobalValue::InternalLinkage, SConstant, Name);
-  Value* Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0);
-  Value* GEPArgs[] = {Zero, Zero};
-  GetElementPtrInst* SPtr = GetElementPtrInst::Create(nullptr, SGlobal,
-                            ArrayRef<Value*>(GEPArgs, 2), Name+"Ptr", IB);
+Value *GenVISC::getStringPointer(const Twine &S, Instruction *IB,
+                                 const Twine &Name) {
+  Constant *SConstant =
+      ConstantDataArray::getString(M->getContext(), S.str(), true);
+  Value *SGlobal =
+      new GlobalVariable(*M, SConstant->getType(), true,
+                         GlobalValue::InternalLinkage, SConstant, Name);
+  Value *Zero = ConstantInt::get(Type::getInt64Ty(M->getContext()), 0);
+  Value *GEPArgs[] = {Zero, Zero};
+  GetElementPtrInst *SPtr = GetElementPtrInst::Create(
+      nullptr, SGlobal, ArrayRef<Value *>(GEPArgs, 2), Name + "Ptr", IB);
   return SPtr;
 }
 
-void GenVISC::initializeTimerSet(Instruction* InsertBefore) {
-  Value* TimerSetAddr;
-  StoreInst* SI;
-  TIMER(TimerSet = new GlobalVariable(*M,
-                                      Type::getInt8PtrTy(M->getContext()),
-                                      false,
-                                      GlobalValue::CommonLinkage,
-                                      Constant::getNullValue(Type::getInt8PtrTy(M->getContext())),
-                                      "viscTimerSet_GenVISC"));
-  DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet << "\n");
-  //DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet << "\n");
-
-  TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet,
-                                        None,
-                                        "",
+void GenVISC::initializeTimerSet(Instruction *InsertBefore) {
+  Value *TimerSetAddr;
+  StoreInst *SI;
+  TIMER(TimerSet = new GlobalVariable(
+            *M, Type::getInt8PtrTy(M->getContext()), false,
+            GlobalValue::CommonLinkage,
+            Constant::getNullValue(Type::getInt8PtrTy(M->getContext())),
+            "viscTimerSet_GenVISC"));
+  DEBUG(errs() << "Inserting GV: " << *TimerSet->getType() << *TimerSet
+               << "\n");
+  // DEBUG(errs() << "Inserting call to: " << *llvm_visc_initializeTimerSet <<
+  // "\n");
+
+  TIMER(TimerSetAddr = CallInst::Create(llvm_visc_initializeTimerSet, None, "",
                                         InsertBefore));
   DEBUG(errs() << "TimerSetAddress = " << *TimerSetAddr << "\n");
   TIMER(SI = new StoreInst(TimerSetAddr, TimerSet, InsertBefore));
   DEBUG(errs() << "Store Timer Address in Global variable: " << *SI << "\n");
 }
 
-void GenVISC::switchToTimer(enum visc_TimerID timer, Instruction* InsertBefore) {
-  Value* switchArgs[] = {TimerSet, getTimerID(*M, timer)};
+void GenVISC::switchToTimer(enum visc_TimerID timer,
+                            Instruction *InsertBefore) {
+  Value *switchArgs[] = {TimerSet, getTimerID(*M, timer)};
   TIMER(CallInst::Create(llvm_visc_switchToTimer,
-                         ArrayRef<Value*>(switchArgs, 2),
-                         "",
-                         InsertBefore));
+                         ArrayRef<Value *>(switchArgs, 2), "", InsertBefore));
 }
 
-void GenVISC::printTimerSet(Instruction* InsertBefore) {
-  Value* TimerName;
+void GenVISC::printTimerSet(Instruction *InsertBefore) {
+  Value *TimerName;
   TIMER(TimerName = getStringPointer("GenVISC_Timer", InsertBefore));
-  Value* printArgs[] = {TimerSet, TimerName};
+  Value *printArgs[] = {TimerSet, TimerName};
   TIMER(CallInst::Create(llvm_visc_printTimerSet,
-                         ArrayRef<Value*>(printArgs, 2),
-                         "",
-                         InsertBefore));
+                         ArrayRef<Value *>(printArgs, 2), "", InsertBefore));
 }
 
-static inline ConstantInt* getTimerID(Module& M, enum visc_TimerID timer) {
+static inline ConstantInt *getTimerID(Module &M, enum visc_TimerID timer) {
   return ConstantInt::get(Type::getInt32Ty(M.getContext()), timer);
 }
 
-static Function* transformReturnTypeToStruct(Function* F) {
+static Function *transformReturnTypeToStruct(Function *F) {
   // Currently only works for void return types
-  DEBUG(errs() << "Transforming return type of function to Struct: " << F->getName() << "\n");
+  DEBUG(errs() << "Transforming return type of function to Struct: "
+               << F->getName() << "\n");
 
   if (isa<StructType>(F->getReturnType())) {
-    DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": " << *F->getReturnType() << "\n");
+    DEBUG(errs() << "Return type is already a Struct: " << F->getName() << ": "
+                 << *F->getReturnType() << "\n");
     return F;
   }
 
-  assert(F->getReturnType()->isVoidTy() && "Unhandled case - Only void return type handled\n");
+  assert(F->getReturnType()->isVoidTy() &&
+         "Unhandled case - Only void return type handled\n");
 
   // Create the argument type list with added argument types
-  std::vector<Type*> ArgTypes;
-  for(Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
-      ai != ae; ++ai) {
+  std::vector<Type *> ArgTypes;
+  for (Function::const_arg_iterator ai = F->arg_begin(), ae = F->arg_end();
+       ai != ae; ++ai) {
     ArgTypes.push_back(ai->getType());
   }
-  
-  StructType* RetTy = StructType::create(F->getContext(), None, "emptyStruct", true);
-  FunctionType* FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg());
-  
-  SmallVector<ReturnInst*, 8> Returns;
-  Function* newF = cloneFunction(F, FTy, false, &Returns);
+
+  StructType *RetTy =
+      StructType::create(F->getContext(), None, "emptyStruct", true);
+  FunctionType *FTy = FunctionType::get(RetTy, ArgTypes, F->isVarArg());
+
+  SmallVector<ReturnInst *, 8> Returns;
+  Function *newF = cloneFunction(F, FTy, false, &Returns);
   // Replace ret void instruction with ret %RetTy undef
-  for(auto &RI: Returns) {
-    DEBUG(errs() << "Found return inst: "<< *RI << "\n");
-    ReturnInst* newRI = ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy));
+  for (auto &RI : Returns) {
+    DEBUG(errs() << "Found return inst: " << *RI << "\n");
+    ReturnInst *newRI =
+        ReturnInst::Create(newF->getContext(), UndefValue::get(RetTy));
     ReplaceInstWithInst(RI, newRI);
   }
 
@@ -870,19 +899,20 @@ static Function* transformReturnTypeToStruct(Function* F) {
   return newF;
 }
 
-static Type* getReturnTypeFromReturnInst(Function* F) {
-  for(BasicBlock &BB: *F) {
-    if(ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
-      DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType() << "\n");
+static Type *getReturnTypeFromReturnInst(Function *F) {
+  for (BasicBlock &BB : *F) {
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+      DEBUG(errs() << "Return type value: " << *RI->getReturnValue()->getType()
+                   << "\n");
       return RI->getReturnValue()->getType();
     }
   }
 }
 
-
 char genvisc::GenVISC::ID = 0;
-static RegisterPass<genvisc::GenVISC> X("genvisc", "Pass to generate VISC IR from LLVM IR (with dummy function calls)", false, false);
+static RegisterPass<genvisc::GenVISC>
+    X("genvisc",
+      "Pass to generate VISC IR from LLVM IR (with dummy function calls)",
+      false, false);
 
 } // End of namespace genvisc
-
-
diff --git a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp
index 359ee74d41a64ae0b2aeb025d9d94c55feaac7b8..7bd66b62c6c8cda589fe3e6c1e3711893aceaffb 100644
--- a/hpvm/lib/Transforms/LocalMem/LocalMem.cpp
+++ b/hpvm/lib/Transforms/LocalMem/LocalMem.cpp
@@ -8,18 +8,18 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "LocalMem"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
+#include "SupportVISC/DFG2LLVM.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/InstIterator.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Linker/Linker.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/SourceMgr.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Constant.h"
-#include "SupportVISC/DFG2LLVM.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 
 using namespace llvm;
 using namespace builddfg;
@@ -28,7 +28,7 @@ using namespace dfg2llvm;
 namespace {
 // Helper Functions
 
-static AllocationNodeProperty* isAllocationNode(DFLeafNode* N);
+static AllocationNodeProperty *isAllocationNode(DFLeafNode *N);
 
 // LocalMem - The first implementation.
 struct LocalMem : public ModulePass {
@@ -53,23 +53,22 @@ public:
 class AT_OCL : public CodeGenTraversal {
 
 private:
-  //Member variables
+  // Member variables
 
-  //Functions
+  // Functions
 
   // Virtual Functions
   void init() {}
   void initRuntimeAPI() {}
-  void codeGen(DFInternalNode* N);
-  void codeGen(DFLeafNode* N);
+  void codeGen(DFInternalNode *N);
+  void codeGen(DFLeafNode *N);
 
 public:
   // Constructor
   AT_OCL(Module &_M, BuildDFG &_DFG) : CodeGenTraversal(_M, _DFG) {
-    //init();
-    //initRuntimeAPI();
+    // init();
+    // initRuntimeAPI();
   }
-
 };
 
 bool LocalMem::runOnModule(Module &M) {
@@ -80,8 +79,8 @@ bool LocalMem::runOnModule(Module &M) {
   // - Maps from i8* hansles to DFNode and DFEdge
   BuildDFG &DFG = getAnalysis<BuildDFG>();
 
-  //DFInternalNode *Root = DFG.getRoot();
-  std::vector<DFInternalNode*> Roots = DFG.getRoots();
+  // DFInternalNode *Root = DFG.getRoot();
+  std::vector<DFInternalNode *> Roots = DFG.getRoots();
   // BuildDFG::HandleToDFNode &HandleToDFNodeMap = DFG.getHandleToDFNodeMap();
   // BuildDFG::HandleToDFEdge &HandleToDFEdgeMap = DFG.getHandleToDFEdgeMap();
 
@@ -89,102 +88,103 @@ bool LocalMem::runOnModule(Module &M) {
   AT_OCL *ATVisitor = new AT_OCL(M, DFG);
 
   // Iterate over all the DFGs and produce code for each one of them
-  for (auto rootNode: Roots) {
+  for (auto rootNode : Roots) {
     // Initiate code generation for root DFNode
     ATVisitor->visit(rootNode);
-    // Go ahead and replace the launch intrinsic with pthread call, otherwise return now.
+    // Go ahead and replace the launch intrinsic with pthread call, otherwise
+    // return now.
     // TODO: Later on, we might like to do this in a separate pass, which would
-    // allow us the flexibility to switch between complete static code generation
-    // for DFG or having a customized runtime+scheduler
+    // allow us the flexibility to switch between complete static code
+    // generation for DFG or having a customized runtime+scheduler
   }
 
   delete ATVisitor;
   return true;
 }
 
-void AT_OCL::codeGen(DFInternalNode* N) {
+void AT_OCL::codeGen(DFInternalNode *N) {
   DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
 }
 
 // Code generation for leaf nodes
-void AT_OCL::codeGen(DFLeafNode* N) {
+void AT_OCL::codeGen(DFLeafNode *N) {
   DEBUG(errs() << "Analysing Node: " << N->getFuncPointer()->getName() << "\n");
   // Skip code generation if it is a dummy node
-  if(N->isDummyNode()) {
+  if (N->isDummyNode()) {
     DEBUG(errs() << "Skipping dummy node\n");
     return;
   }
 
   // Check and mark as allocation node
-  AllocationNodeProperty* ANP = isAllocationNode(N);
-  if(ANP != NULL) {
+  AllocationNodeProperty *ANP = isAllocationNode(N);
+  if (ANP != NULL) {
     // set Properties of the allocation node
     N->setProperty(DFNode::Allocation, ANP);
-    AllocationNodeProperty* anp = (AllocationNodeProperty*) N->getProperty(DFNode::Allocation);
+    AllocationNodeProperty *anp =
+        (AllocationNodeProperty *)N->getProperty(DFNode::Allocation);
     AllocationNodeProperty::AllocationListType AL = anp->getAllocationList();
     DEBUG(errs() << "Total allocations = " << AL.size() << "\n");
-    for(auto P: AL) {
+    for (auto P : AL) {
       DEBUG(errs() << " EdgePort: " << P.first->getDestPosition());
       DEBUG(errs() << " Size: " << *P.second << "\n");
-    } 
-
+    }
   }
 }
 
-// Return pointer to property if this leaf node matches the conditions for being an allocation
-// node.
-// Conditions 
+// Return pointer to property if this leaf node matches the conditions for being
+// an allocation node. Conditions
 // 1. No incoming memory pointer. No in/out attribute on a pointer argument
 // 2. Uses visc malloc intrinsic to allocate memory
 // 3. Sends it out
 // 2. (TODO:) Whether the allocated pointer escapes the parent node
-AllocationNodeProperty* isAllocationNode(DFLeafNode* N) {
+AllocationNodeProperty *isAllocationNode(DFLeafNode *N) {
   // Allocation node must be free from side-effects
-  if(N->hasSideEffects())
+  if (N->hasSideEffects())
     return NULL;
 
   // Allocation node must have some outgoing edges
-  if(N->getOutputType()->isEmptyTy())
+  if (N->getOutputType()->isEmptyTy())
     return NULL;
 
-  Function* F = N->getFuncPointer();
-  
+  Function *F = N->getFuncPointer();
+
   // Allocation node must use visc malloc intrinsic
   bool usesVISCMalloc = false;
-  for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; i++) {
-    Instruction* I = &*i;
-    if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(I)) {
-      if(II->getIntrinsicID() == Intrinsic::visc_malloc) {
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; i++) {
+    Instruction *I = &*i;
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+      if (II->getIntrinsicID() == Intrinsic::visc_malloc) {
         usesVISCMalloc = true;
         break;
       }
     }
-  } 
-  if(!usesVISCMalloc)
+  }
+  if (!usesVISCMalloc)
     return NULL;
 
   // TODO: Check if allocated pointer leaves parent node
-  
+
   // This is an allocation node
-  AllocationNodeProperty* ANP = new AllocationNodeProperty();
+  AllocationNodeProperty *ANP = new AllocationNodeProperty();
   // Find the return statement.
   // FIXME: For now, assuming their is just one BB. Terminator instruction of
   // this BB is a return statement. The value returned is what we need
-  BasicBlock& BB = F->getEntryBlock();
-  assert(isa<ReturnInst>(BB.getTerminator())
-      && "Currently we do not handle the case where Allocation Node has multiple BB");
-  ReturnInst* RI = dyn_cast<ReturnInst>(BB.getTerminator());
+  BasicBlock &BB = F->getEntryBlock();
+  assert(isa<ReturnInst>(BB.getTerminator()) &&
+         "Currently we do not handle the case where Allocation Node has "
+         "multiple BB");
+  ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
   // Find the returned struct
-  Value* val = RI->getReturnValue();
-  std::vector<Value*> OutValues(6, NULL);
+  Value *val = RI->getReturnValue();
+  std::vector<Value *> OutValues(6, NULL);
   unsigned numOutputs = N->getOutputType()->getNumElements();
-  for(unsigned i = 0; i < numOutputs; i++) {
-    if(InsertValueInst* IV = dyn_cast<InsertValueInst>(val)) {
-      DEBUG(errs() << "Value at out edge" << numOutputs-1-i << ": " << *val << "\n");
-      OutValues[numOutputs-1-i] = IV->getOperand(1);
+  for (unsigned i = 0; i < numOutputs; i++) {
+    if (InsertValueInst *IV = dyn_cast<InsertValueInst>(val)) {
+      DEBUG(errs() << "Value at out edge" << numOutputs - 1 - i << ": " << *val
+                   << "\n");
+      OutValues[numOutputs - 1 - i] = IV->getOperand(1);
       val = IV->getOperand(0);
-    }
-    else {
+    } else {
       DEBUG(errs() << "Unexpected value at out edge: " << *val << "\n");
       llvm_unreachable("Expecting InsertValue instruction. Error!");
     }
@@ -192,33 +192,34 @@ AllocationNodeProperty* isAllocationNode(DFLeafNode* N) {
   // OutValues vector contains all the values that will go out
   // Assume that the Allocation node only sends the pointers and their sizes
   // forward
-  unsigned i=0;
-  while(i < numOutputs) {
-    assert(OutValues[i]->getType()->isPointerTy()
-        && "Expected outgoing edge to be of pointer type");
-    if(IntrinsicInst* II = dyn_cast<IntrinsicInst>(OutValues[i])) {
-      if(II->getIntrinsicID() == Intrinsic::visc_malloc) {
+  unsigned i = 0;
+  while (i < numOutputs) {
+    assert(OutValues[i]->getType()->isPointerTy() &&
+           "Expected outgoing edge to be of pointer type");
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(OutValues[i])) {
+      if (II->getIntrinsicID() == Intrinsic::visc_malloc) {
         // Sanity check: Size passed to malloc intrinsic is same as the value
         // going into the next outgoing edge
-        DEBUG(errs() << "Visc malloc size: " << *II->getArgOperand(0) << "\n"); 
-        DEBUG(errs() << "Out edge value: " << *OutValues[i+1] << "\n"); 
-        assert(II->getArgOperand(0) == OutValues[i+1]
-            && "Sanity Check Failed: VISC Malloc size argument != next outgoing edge");
+        DEBUG(errs() << "Visc malloc size: " << *II->getArgOperand(0) << "\n");
+        DEBUG(errs() << "Out edge value: " << *OutValues[i + 1] << "\n");
+        assert(II->getArgOperand(0) == OutValues[i + 1] &&
+               "Sanity Check Failed: VISC Malloc size argument != next "
+               "outgoing edge");
         ANP->insertAllocation(N->getOutDFEdgeAt(i), II->getArgOperand(0));
-        i = i+2;
+        i = i + 2;
         continue;
       }
     }
     llvm_unreachable("Expecting visc malloc intrinsic instruction!");
-  } 
+  }
   return ANP;
 }
 
 } // End of namespace
 
 char LocalMem::ID = 0;
-static RegisterPass<LocalMem> X("localmem",
-                                    "Pass to identifying nodes amenable to local memory allocation",
-                                    false /* does not modify the CFG */,
-                                    true /* transformation, not just analysis */);
-
+static RegisterPass<LocalMem>
+    X("localmem",
+      "Pass to identifying nodes amenable to local memory allocation",
+      false /* does not modify the CFG */,
+      true /* transformation, not just analysis */);
diff --git a/hpvm/llvm_patches/include/Bitcode/LLVMBitCodes.h b/hpvm/llvm_patches/include/Bitcode/LLVMBitCodes.h
index 4870066fe4e10a6b0a02d50c7a79639103219be6..5e59ba96f2331663289a040326ebd4e453bd1e86 100644
--- a/hpvm/llvm_patches/include/Bitcode/LLVMBitCodes.h
+++ b/hpvm/llvm_patches/include/Bitcode/LLVMBitCodes.h
@@ -331,7 +331,7 @@ enum MetadataCodes {
   METADATA_INDEX_OFFSET = 38,           // [offset]
   METADATA_INDEX = 39,                  // [bitpos]
   METADATA_LABEL = 40,                  // [distinct, scope, name, file, line]
-  METADATA_COMMON_BLOCK = 44,     // [distinct, scope, name, variable,...]
+  METADATA_COMMON_BLOCK = 44,           // [distinct, scope, name, variable,...]
 };
 
 // The constants block (CONSTANTS_BLOCK_ID) describes emission for each
@@ -363,7 +363,7 @@ enum ConstantsCodes {
   CST_CODE_INLINEASM = 23,       // INLINEASM:     [sideeffect|alignstack|
                                  //                 asmdialect,asmstr,conststr]
   CST_CODE_CE_GEP_WITH_INRANGE_INDEX = 24, //      [opty, flags, n x operands]
-  CST_CODE_CE_UNOP = 25,         // CE_UNOP:      [opcode, opval]
+  CST_CODE_CE_UNOP = 25,                   // CE_UNOP:      [opcode, opval]
 };
 
 /// CastOpcodes - These are values used in the bitcode files to encode which
@@ -390,9 +390,7 @@ enum CastOpcodes {
 /// unop a CST_CODE_CE_UNOP or a XXX refers to.  The values of these enums
 /// have no fixed relation to the LLVM IR enum values.  Changing these will
 /// break compatibility with old files.
-enum UnaryOpcodes {
-  UNOP_NEG = 0
-};
+enum UnaryOpcodes { UNOP_NEG = 0 };
 
 /// BinaryOpcodes - These are values used in the bitcode files to encode which
 /// binop a CST_CODE_CE_BINOP or a XXX refers to.  The values of these enums
@@ -444,14 +442,14 @@ enum OverflowingBinaryOperatorOptionalFlags {
 /// This is a fixed layout derived from the bitcode emitted by LLVM 5.0
 /// intended to decouple the in-memory representation from the serialization.
 enum FastMathMap {
-  UnsafeAlgebra   = (1 << 0), // Legacy
-  NoNaNs          = (1 << 1),
-  NoInfs          = (1 << 2),
-  NoSignedZeros   = (1 << 3),
+  UnsafeAlgebra = (1 << 0), // Legacy
+  NoNaNs = (1 << 1),
+  NoInfs = (1 << 2),
+  NoSignedZeros = (1 << 3),
   AllowReciprocal = (1 << 4),
-  AllowContract   = (1 << 5),
-  ApproxFunc      = (1 << 6),
-  AllowReassoc    = (1 << 7)
+  AllowContract = (1 << 5),
+  ApproxFunc = (1 << 6),
+  AllowReassoc = (1 << 7)
 };
 
 /// PossiblyExactOperatorOptionalFlags - Flags for serializing
@@ -653,7 +651,7 @@ enum SymtabCodes {
   SYMTAB_BLOB = 1,
 };
 
-} // End bitc namespace
-} // End llvm namespace
+} // namespace bitc
+} // namespace llvm
 
 #endif
diff --git a/hpvm/llvm_patches/include/Support/Debug.h b/hpvm/llvm_patches/include/Support/Debug.h
index 277a6d7b89336841779941ca5034e116eb14ebfc..25c031100e29d06b594966c7378d190c73ea09fb 100644
--- a/hpvm/llvm_patches/include/Support/Debug.h
+++ b/hpvm/llvm_patches/include/Support/Debug.h
@@ -61,15 +61,20 @@ void setCurrentDebugTypes(const char **Types, unsigned Count);
 ///
 /// This will emit the debug information if -debug is present, and -debug-only
 /// is not specified, or is specified as "bitset".
-#define DEBUG_WITH_TYPE(TYPE, X)                                        \
-  do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType(TYPE)) { X; } \
+#define DEBUG_WITH_TYPE(TYPE, X)                                               \
+  do {                                                                         \
+    if (::llvm::DebugFlag && ::llvm::isCurrentDebugType(TYPE)) {               \
+      X;                                                                       \
+    }                                                                          \
   } while (false)
 
 #else
 #define isCurrentDebugType(X) (false)
 #define setCurrentDebugType(X)
 #define setCurrentDebugTypes(X, N)
-#define DEBUG_WITH_TYPE(TYPE, X) do { } while (false)
+#define DEBUG_WITH_TYPE(TYPE, X)                                               \
+  do {                                                                         \
+  } while (false)
 #endif
 
 /// This boolean is set to true if the '-debug' command line option
diff --git a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp
index 3861f4c7a22c9bc8e473d67e118357c5706c00d8..a924405a2cac85ccd2e5e903a1ee1abb52774566 100644
--- a/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp
+++ b/hpvm/llvm_patches/lib/AsmParser/LLLexer.cpp
@@ -47,8 +47,8 @@ uint64_t LLLexer::atoull(const char *Buffer, const char *End) {
   for (; Buffer != End; Buffer++) {
     uint64_t OldRes = Result;
     Result *= 10;
-    Result += *Buffer-'0';
-    if (Result < OldRes) {  // Uh, oh, overflow detected!!!
+    Result += *Buffer - '0';
+    if (Result < OldRes) { // Uh, oh, overflow detected!!!
       Error("constant bigger than 64 bits detected!");
       return 0;
     }
@@ -63,7 +63,7 @@ uint64_t LLLexer::HexIntToVal(const char *Buffer, const char *End) {
     Result *= 16;
     Result += hexDigitValue(*Buffer);
 
-    if (Result < OldRes) {   // Uh, oh, overflow detected!!!
+    if (Result < OldRes) { // Uh, oh, overflow detected!!!
       Error("constant bigger than 64 bits detected!");
       return 0;
     }
@@ -93,9 +93,9 @@ void LLLexer::HexToIntPair(const char *Buffer, const char *End,
 /// FP80HexToIntPair - translate an 80 bit FP80 number (20 hexits) into
 /// { low64, high16 } as usual for an APInt.
 void LLLexer::FP80HexToIntPair(const char *Buffer, const char *End,
-                           uint64_t Pair[2]) {
+                               uint64_t Pair[2]) {
   Pair[1] = 0;
-  for (int i=0; i<4 && Buffer != End; i++, Buffer++) {
+  for (int i = 0; i < 4 && Buffer != End; i++, Buffer++) {
     assert(Buffer != End);
     Pair[1] *= 16;
     Pair[1] += hexDigitValue(*Buffer);
@@ -112,20 +112,21 @@ void LLLexer::FP80HexToIntPair(const char *Buffer, const char *End,
 // UnEscapeLexed - Run through the specified buffer and change \xx codes to the
 // appropriate character.
 static void UnEscapeLexed(std::string &Str) {
-  if (Str.empty()) return;
+  if (Str.empty())
+    return;
 
-  char *Buffer = &Str[0], *EndBuffer = Buffer+Str.size();
+  char *Buffer = &Str[0], *EndBuffer = Buffer + Str.size();
   char *BOut = Buffer;
-  for (char *BIn = Buffer; BIn != EndBuffer; ) {
+  for (char *BIn = Buffer; BIn != EndBuffer;) {
     if (BIn[0] == '\\') {
-      if (BIn < EndBuffer-1 && BIn[1] == '\\') {
+      if (BIn < EndBuffer - 1 && BIn[1] == '\\') {
         *BOut++ = '\\'; // Two \ becomes one
         BIn += 2;
-      } else if (BIn < EndBuffer-2 &&
+      } else if (BIn < EndBuffer - 2 &&
                  isxdigit(static_cast<unsigned char>(BIn[1])) &&
                  isxdigit(static_cast<unsigned char>(BIn[2]))) {
         *BOut = hexDigitValue(BIn[1]) * 16 + hexDigitValue(BIn[2]);
-        BIn += 3;                           // Skip over handled chars
+        BIn += 3; // Skip over handled chars
         ++BOut;
       } else {
         *BOut++ = *BIn++;
@@ -134,7 +135,7 @@ static void UnEscapeLexed(std::string &Str) {
       *BOut++ = *BIn++;
     }
   }
-  Str.resize(BOut-Buffer);
+  Str.resize(BOut - Buffer);
 }
 
 /// isLabelChar - Return true for [-a-zA-Z$._0-9].
@@ -146,8 +147,10 @@ static bool isLabelChar(char C) {
 /// isLabelTail - Return true if this pointer points to a valid end of a label.
 static const char *isLabelTail(const char *CurPtr) {
   while (true) {
-    if (CurPtr[0] == ':') return CurPtr+1;
-    if (!isLabelChar(CurPtr[0])) return nullptr;
+    if (CurPtr[0] == ':')
+      return CurPtr + 1;
+    if (!isLabelChar(CurPtr[0]))
+      return nullptr;
     ++CurPtr;
   }
 }
@@ -166,15 +169,16 @@ LLLexer::LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &Err,
 int LLLexer::getNextChar() {
   char CurChar = *CurPtr++;
   switch (CurChar) {
-  default: return (unsigned char)CurChar;
+  default:
+    return (unsigned char)CurChar;
   case 0:
     // A nul character in the stream is either the end of the current buffer or
     // a random nul in the file.  Disambiguate that here.
-    if (CurPtr-1 != CurBuf.end())
-      return 0;  // Just whitespace.
+    if (CurPtr - 1 != CurBuf.end())
+      return 0; // Just whitespace.
 
     // Otherwise, return end of file.
-    --CurPtr;  // Another call to lex will return EOF again.
+    --CurPtr; // Another call to lex will return EOF again.
     return EOF;
   }
 }
@@ -191,7 +195,8 @@ lltok::Kind LLLexer::LexToken() {
         return LexIdentifier();
 
       return lltok::Error;
-    case EOF: return lltok::Eof;
+    case EOF:
+      return lltok::Eof;
     case 0:
     case ' ':
     case '\t':
@@ -199,15 +204,20 @@ lltok::Kind LLLexer::LexToken() {
     case '\r':
       // Ignore whitespace.
       continue;
-    case '+': return LexPositive();
-    case '@': return LexAt();
-    case '$': return LexDollar();
-    case '%': return LexPercent();
-    case '"': return LexQuote();
+    case '+':
+      return LexPositive();
+    case '@':
+      return LexAt();
+    case '$':
+      return LexDollar();
+    case '%':
+      return LexPercent();
+    case '"':
+      return LexQuote();
     case '.':
       if (const char *Ptr = isLabelTail(CurPtr)) {
         CurPtr = Ptr;
-        StrVal.assign(TokStart, CurPtr-1);
+        StrVal.assign(TokStart, CurPtr - 1);
         return lltok::LabelStr;
       }
       if (CurPtr[0] == '.' && CurPtr[1] == '.') {
@@ -218,28 +228,50 @@ lltok::Kind LLLexer::LexToken() {
     case ';':
       SkipLineComment();
       continue;
-    case '!': return LexExclaim();
+    case '!':
+      return LexExclaim();
     case '^':
       return LexCaret();
     case ':':
       return lltok::colon;
-    case '#': return LexHash();
-    case '0': case '1': case '2': case '3': case '4':
-    case '5': case '6': case '7': case '8': case '9':
+    case '#':
+      return LexHash();
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9':
     case '-':
       return LexDigitOrNegative();
-    case '=': return lltok::equal;
-    case '[': return lltok::lsquare;
-    case ']': return lltok::rsquare;
-    case '{': return lltok::lbrace;
-    case '}': return lltok::rbrace;
-    case '<': return lltok::less;
-    case '>': return lltok::greater;
-    case '(': return lltok::lparen;
-    case ')': return lltok::rparen;
-    case ',': return lltok::comma;
-    case '*': return lltok::star;
-    case '|': return lltok::bar;
+    case '=':
+      return lltok::equal;
+    case '[':
+      return lltok::lsquare;
+    case ']':
+      return lltok::rsquare;
+    case '{':
+      return lltok::lbrace;
+    case '}':
+      return lltok::rbrace;
+    case '<':
+      return lltok::less;
+    case '>':
+      return lltok::greater;
+    case '(':
+      return lltok::lparen;
+    case ')':
+      return lltok::rparen;
+    case ',':
+      return lltok::comma;
+    case '*':
+      return lltok::star;
+    case '|':
+      return lltok::bar;
     }
   }
 }
@@ -307,7 +339,7 @@ lltok::Kind LLLexer::ReadString(lltok::Kind kind) {
       return lltok::Error;
     }
     if (CurChar == '"') {
-      StrVal.assign(Start, CurPtr-1);
+      StrVal.assign(Start, CurPtr - 1);
       UnEscapeLexed(StrVal);
       return kind;
     }
@@ -317,13 +349,11 @@ lltok::Kind LLLexer::ReadString(lltok::Kind kind) {
 /// ReadVarName - Read the rest of a token containing a variable name.
 bool LLLexer::ReadVarName() {
   const char *NameStart = CurPtr;
-  if (isalpha(static_cast<unsigned char>(CurPtr[0])) ||
-      CurPtr[0] == '-' || CurPtr[0] == '$' ||
-      CurPtr[0] == '.' || CurPtr[0] == '_') {
+  if (isalpha(static_cast<unsigned char>(CurPtr[0])) || CurPtr[0] == '-' ||
+      CurPtr[0] == '$' || CurPtr[0] == '.' || CurPtr[0] == '_') {
     ++CurPtr;
-    while (isalnum(static_cast<unsigned char>(CurPtr[0])) ||
-           CurPtr[0] == '-' || CurPtr[0] == '$' ||
-           CurPtr[0] == '.' || CurPtr[0] == '_')
+    while (isalnum(static_cast<unsigned char>(CurPtr[0])) || CurPtr[0] == '-' ||
+           CurPtr[0] == '$' || CurPtr[0] == '.' || CurPtr[0] == '_')
       ++CurPtr;
 
     StrVal.assign(NameStart, CurPtr);
@@ -361,7 +391,7 @@ lltok::Kind LLLexer::LexVar(lltok::Kind Var, lltok::Kind VarID) {
         return lltok::Error;
       }
       if (CurChar == '"') {
-        StrVal.assign(TokStart+2, CurPtr-1);
+        StrVal.assign(TokStart + 2, CurPtr - 1);
         UnEscapeLexed(StrVal);
         if (StringRef(StrVal).find_first_of(0) != StringRef::npos) {
           Error("Null bytes are not allowed in names");
@@ -414,16 +444,16 @@ lltok::Kind LLLexer::LexQuote() {
 ///    !
 lltok::Kind LLLexer::LexExclaim() {
   // Lex a metadata name as a MetadataVar.
-  if (isalpha(static_cast<unsigned char>(CurPtr[0])) ||
-      CurPtr[0] == '-' || CurPtr[0] == '$' ||
-      CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\') {
+  if (isalpha(static_cast<unsigned char>(CurPtr[0])) || CurPtr[0] == '-' ||
+      CurPtr[0] == '$' || CurPtr[0] == '.' || CurPtr[0] == '_' ||
+      CurPtr[0] == '\\') {
     ++CurPtr;
-    while (isalnum(static_cast<unsigned char>(CurPtr[0])) ||
-           CurPtr[0] == '-' || CurPtr[0] == '$' ||
-           CurPtr[0] == '.' || CurPtr[0] == '_' || CurPtr[0] == '\\')
+    while (isalnum(static_cast<unsigned char>(CurPtr[0])) || CurPtr[0] == '-' ||
+           CurPtr[0] == '$' || CurPtr[0] == '.' || CurPtr[0] == '_' ||
+           CurPtr[0] == '\\')
       ++CurPtr;
 
-    StrVal.assign(TokStart+1, CurPtr);   // Skip !
+    StrVal.assign(TokStart + 1, CurPtr); // Skip !
     UnEscapeLexed(StrVal);
     return lltok::MetadataVar;
   }
@@ -466,13 +496,14 @@ lltok::Kind LLLexer::LexIdentifier() {
   // If we stopped due to a colon, unless we were directed to ignore it,
   // this really is a label.
   if (!IgnoreColonInIdentifiers && *CurPtr == ':') {
-    StrVal.assign(StartChar-1, CurPtr++);
+    StrVal.assign(StartChar - 1, CurPtr++);
     return lltok::LabelStr;
   }
 
   // Otherwise, this wasn't a label.  If this was valid as an integer type,
   // return it.
-  if (!IntEnd) IntEnd = CurPtr;
+  if (!IntEnd)
+    IntEnd = CurPtr;
   if (IntEnd != StartChar) {
     CurPtr = IntEnd;
     uint64_t NumBits = atoull(StartChar, CurPtr);
@@ -486,7 +517,8 @@ lltok::Kind LLLexer::LexIdentifier() {
   }
 
   // Otherwise, this was a letter sequence.  See which keyword this is.
-  if (!KeywordEnd) KeywordEnd = CurPtr;
+  if (!KeywordEnd)
+    KeywordEnd = CurPtr;
   CurPtr = KeywordEnd;
   --StartChar;
   StringRef Keyword(StartChar, CurPtr - StartChar);
@@ -497,9 +529,12 @@ lltok::Kind LLLexer::LexIdentifier() {
       return lltok::kw_##STR;                                                  \
   } while (false)
 
-  KEYWORD(true);    KEYWORD(false);
-  KEYWORD(declare); KEYWORD(define);
-  KEYWORD(global);  KEYWORD(constant);
+  KEYWORD(true);
+  KEYWORD(false);
+  KEYWORD(declare);
+  KEYWORD(define);
+  KEYWORD(global);
+  KEYWORD(constant);
 
   KEYWORD(dso_local);
   KEYWORD(dso_preemptable);
@@ -542,7 +577,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(triple);
   KEYWORD(source_filename);
   KEYWORD(unwind);
-  KEYWORD(deplibs);             // FIXME: Remove in 4.0.
+  KEYWORD(deplibs); // FIXME: Remove in 4.0.
   KEYWORD(datalayout);
   KEYWORD(volatile);
   KEYWORD(atomic);
@@ -703,12 +738,32 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(noduplicates);
   KEYWORD(samesize);
 
-  KEYWORD(eq); KEYWORD(ne); KEYWORD(slt); KEYWORD(sgt); KEYWORD(sle);
-  KEYWORD(sge); KEYWORD(ult); KEYWORD(ugt); KEYWORD(ule); KEYWORD(uge);
-  KEYWORD(oeq); KEYWORD(one); KEYWORD(olt); KEYWORD(ogt); KEYWORD(ole);
-  KEYWORD(oge); KEYWORD(ord); KEYWORD(uno); KEYWORD(ueq); KEYWORD(une);
-
-  KEYWORD(xchg); KEYWORD(nand); KEYWORD(max); KEYWORD(min); KEYWORD(umax);
+  KEYWORD(eq);
+  KEYWORD(ne);
+  KEYWORD(slt);
+  KEYWORD(sgt);
+  KEYWORD(sle);
+  KEYWORD(sge);
+  KEYWORD(ult);
+  KEYWORD(ugt);
+  KEYWORD(ule);
+  KEYWORD(uge);
+  KEYWORD(oeq);
+  KEYWORD(one);
+  KEYWORD(olt);
+  KEYWORD(ogt);
+  KEYWORD(ole);
+  KEYWORD(oge);
+  KEYWORD(ord);
+  KEYWORD(uno);
+  KEYWORD(ueq);
+  KEYWORD(une);
+
+  KEYWORD(xchg);
+  KEYWORD(nand);
+  KEYWORD(max);
+  KEYWORD(min);
+  KEYWORD(umax);
   KEYWORD(umin);
 
   KEYWORD(vscale);
@@ -800,10 +855,10 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(bit);
   KEYWORD(varFlags);
 
-// VISC parameter attributes
-   KEYWORD(in);
-   KEYWORD(out);
-   KEYWORD(inout);
+  // VISC parameter attributes
+  KEYWORD(in);
+  KEYWORD(out);
+  KEYWORD(inout);
 
 #undef KEYWORD
 
@@ -816,17 +871,17 @@ lltok::Kind LLLexer::LexIdentifier() {
     }                                                                          \
   } while (false)
 
-  TYPEKEYWORD("void",      Type::getVoidTy(Context));
-  TYPEKEYWORD("half",      Type::getHalfTy(Context));
-  TYPEKEYWORD("float",     Type::getFloatTy(Context));
-  TYPEKEYWORD("double",    Type::getDoubleTy(Context));
-  TYPEKEYWORD("x86_fp80",  Type::getX86_FP80Ty(Context));
-  TYPEKEYWORD("fp128",     Type::getFP128Ty(Context));
+  TYPEKEYWORD("void", Type::getVoidTy(Context));
+  TYPEKEYWORD("half", Type::getHalfTy(Context));
+  TYPEKEYWORD("float", Type::getFloatTy(Context));
+  TYPEKEYWORD("double", Type::getDoubleTy(Context));
+  TYPEKEYWORD("x86_fp80", Type::getX86_FP80Ty(Context));
+  TYPEKEYWORD("fp128", Type::getFP128Ty(Context));
   TYPEKEYWORD("ppc_fp128", Type::getPPC_FP128Ty(Context));
-  TYPEKEYWORD("label",     Type::getLabelTy(Context));
-  TYPEKEYWORD("metadata",  Type::getMetadataTy(Context));
-  TYPEKEYWORD("x86_mmx",   Type::getX86_MMXTy(Context));
-  TYPEKEYWORD("token",     Type::getTokenTy(Context));
+  TYPEKEYWORD("label", Type::getLabelTy(Context));
+  TYPEKEYWORD("metadata", Type::getMetadataTy(Context));
+  TYPEKEYWORD("x86_mmx", Type::getX86_MMXTy(Context));
+  TYPEKEYWORD("token", Type::getTokenTy(Context));
 
 #undef TYPEKEYWORD
 
@@ -839,62 +894,74 @@ lltok::Kind LLLexer::LexIdentifier() {
     }                                                                          \
   } while (false)
 
-  INSTKEYWORD(fneg,  FNeg);
-
-  INSTKEYWORD(add,   Add);  INSTKEYWORD(fadd,   FAdd);
-  INSTKEYWORD(sub,   Sub);  INSTKEYWORD(fsub,   FSub);
-  INSTKEYWORD(mul,   Mul);  INSTKEYWORD(fmul,   FMul);
-  INSTKEYWORD(udiv,  UDiv); INSTKEYWORD(sdiv,  SDiv); INSTKEYWORD(fdiv,  FDiv);
-  INSTKEYWORD(urem,  URem); INSTKEYWORD(srem,  SRem); INSTKEYWORD(frem,  FRem);
-  INSTKEYWORD(shl,   Shl);  INSTKEYWORD(lshr,  LShr); INSTKEYWORD(ashr,  AShr);
-  INSTKEYWORD(and,   And);  INSTKEYWORD(or,    Or);   INSTKEYWORD(xor,   Xor);
-  INSTKEYWORD(icmp,  ICmp); INSTKEYWORD(fcmp,  FCmp);
-
-  INSTKEYWORD(phi,         PHI);
-  INSTKEYWORD(call,        Call);
-  INSTKEYWORD(trunc,       Trunc);
-  INSTKEYWORD(zext,        ZExt);
-  INSTKEYWORD(sext,        SExt);
-  INSTKEYWORD(fptrunc,     FPTrunc);
-  INSTKEYWORD(fpext,       FPExt);
-  INSTKEYWORD(uitofp,      UIToFP);
-  INSTKEYWORD(sitofp,      SIToFP);
-  INSTKEYWORD(fptoui,      FPToUI);
-  INSTKEYWORD(fptosi,      FPToSI);
-  INSTKEYWORD(inttoptr,    IntToPtr);
-  INSTKEYWORD(ptrtoint,    PtrToInt);
-  INSTKEYWORD(bitcast,     BitCast);
+  INSTKEYWORD(fneg, FNeg);
+
+  INSTKEYWORD(add, Add);
+  INSTKEYWORD(fadd, FAdd);
+  INSTKEYWORD(sub, Sub);
+  INSTKEYWORD(fsub, FSub);
+  INSTKEYWORD(mul, Mul);
+  INSTKEYWORD(fmul, FMul);
+  INSTKEYWORD(udiv, UDiv);
+  INSTKEYWORD(sdiv, SDiv);
+  INSTKEYWORD(fdiv, FDiv);
+  INSTKEYWORD(urem, URem);
+  INSTKEYWORD(srem, SRem);
+  INSTKEYWORD(frem, FRem);
+  INSTKEYWORD(shl, Shl);
+  INSTKEYWORD(lshr, LShr);
+  INSTKEYWORD(ashr, AShr);
+  INSTKEYWORD(and, And);
+  INSTKEYWORD(or, Or);
+  INSTKEYWORD(xor, Xor);
+  INSTKEYWORD(icmp, ICmp);
+  INSTKEYWORD(fcmp, FCmp);
+
+  INSTKEYWORD(phi, PHI);
+  INSTKEYWORD(call, Call);
+  INSTKEYWORD(trunc, Trunc);
+  INSTKEYWORD(zext, ZExt);
+  INSTKEYWORD(sext, SExt);
+  INSTKEYWORD(fptrunc, FPTrunc);
+  INSTKEYWORD(fpext, FPExt);
+  INSTKEYWORD(uitofp, UIToFP);
+  INSTKEYWORD(sitofp, SIToFP);
+  INSTKEYWORD(fptoui, FPToUI);
+  INSTKEYWORD(fptosi, FPToSI);
+  INSTKEYWORD(inttoptr, IntToPtr);
+  INSTKEYWORD(ptrtoint, PtrToInt);
+  INSTKEYWORD(bitcast, BitCast);
   INSTKEYWORD(addrspacecast, AddrSpaceCast);
-  INSTKEYWORD(select,      Select);
-  INSTKEYWORD(va_arg,      VAArg);
-  INSTKEYWORD(ret,         Ret);
-  INSTKEYWORD(br,          Br);
-  INSTKEYWORD(switch,      Switch);
-  INSTKEYWORD(indirectbr,  IndirectBr);
-  INSTKEYWORD(invoke,      Invoke);
-  INSTKEYWORD(resume,      Resume);
+  INSTKEYWORD(select, Select);
+  INSTKEYWORD(va_arg, VAArg);
+  INSTKEYWORD(ret, Ret);
+  INSTKEYWORD(br, Br);
+  INSTKEYWORD(switch, Switch);
+  INSTKEYWORD(indirectbr, IndirectBr);
+  INSTKEYWORD(invoke, Invoke);
+  INSTKEYWORD(resume, Resume);
   INSTKEYWORD(unreachable, Unreachable);
-  INSTKEYWORD(callbr,      CallBr);
-
-  INSTKEYWORD(alloca,      Alloca);
-  INSTKEYWORD(load,        Load);
-  INSTKEYWORD(store,       Store);
-  INSTKEYWORD(cmpxchg,     AtomicCmpXchg);
-  INSTKEYWORD(atomicrmw,   AtomicRMW);
-  INSTKEYWORD(fence,       Fence);
+  INSTKEYWORD(callbr, CallBr);
+
+  INSTKEYWORD(alloca, Alloca);
+  INSTKEYWORD(load, Load);
+  INSTKEYWORD(store, Store);
+  INSTKEYWORD(cmpxchg, AtomicCmpXchg);
+  INSTKEYWORD(atomicrmw, AtomicRMW);
+  INSTKEYWORD(fence, Fence);
   INSTKEYWORD(getelementptr, GetElementPtr);
 
   INSTKEYWORD(extractelement, ExtractElement);
-  INSTKEYWORD(insertelement,  InsertElement);
-  INSTKEYWORD(shufflevector,  ShuffleVector);
-  INSTKEYWORD(extractvalue,   ExtractValue);
-  INSTKEYWORD(insertvalue,    InsertValue);
-  INSTKEYWORD(landingpad,     LandingPad);
-  INSTKEYWORD(cleanupret,     CleanupRet);
-  INSTKEYWORD(catchret,       CatchRet);
-  INSTKEYWORD(catchswitch,  CatchSwitch);
-  INSTKEYWORD(catchpad,     CatchPad);
-  INSTKEYWORD(cleanuppad,   CleanupPad);
+  INSTKEYWORD(insertelement, InsertElement);
+  INSTKEYWORD(shufflevector, ShuffleVector);
+  INSTKEYWORD(extractvalue, ExtractValue);
+  INSTKEYWORD(insertvalue, InsertValue);
+  INSTKEYWORD(landingpad, LandingPad);
+  INSTKEYWORD(cleanupret, CleanupRet);
+  INSTKEYWORD(catchret, CatchRet);
+  INSTKEYWORD(catchswitch, CatchSwitch);
+  INSTKEYWORD(catchpad, CatchPad);
+  INSTKEYWORD(cleanuppad, CleanupPad);
 
 #undef INSTKEYWORD
 
@@ -944,15 +1011,14 @@ lltok::Kind LLLexer::LexIdentifier() {
 
   // Check for [us]0x[0-9A-Fa-f]+ which are Hexadecimal constant generated by
   // the CFE to avoid forcing it to deal with 64-bit numbers.
-  if ((TokStart[0] == 'u' || TokStart[0] == 's') &&
-      TokStart[1] == '0' && TokStart[2] == 'x' &&
-      isxdigit(static_cast<unsigned char>(TokStart[3]))) {
-    int len = CurPtr-TokStart-3;
+  if ((TokStart[0] == 'u' || TokStart[0] == 's') && TokStart[1] == '0' &&
+      TokStart[2] == 'x' && isxdigit(static_cast<unsigned char>(TokStart[3]))) {
+    int len = CurPtr - TokStart - 3;
     uint32_t bits = len * 4;
     StringRef HexStr(TokStart + 3, len);
     if (!all_of(HexStr, isxdigit)) {
       // Bad token, return it as an error.
-      CurPtr = TokStart+3;
+      CurPtr = TokStart + 3;
       return lltok::Error;
     }
     APInt Tmp(bits, HexStr, 16);
@@ -965,12 +1031,12 @@ lltok::Kind LLLexer::LexIdentifier() {
 
   // If this is "cc1234", return this as just "cc".
   if (TokStart[0] == 'c' && TokStart[1] == 'c') {
-    CurPtr = TokStart+2;
+    CurPtr = TokStart + 2;
     return lltok::kw_cc;
   }
 
   // Finally, if this isn't known, return an error.
-  CurPtr = TokStart+1;
+  CurPtr = TokStart + 1;
   return lltok::Error;
 }
 
@@ -993,7 +1059,7 @@ lltok::Kind LLLexer::Lex0x() {
 
   if (!isxdigit(static_cast<unsigned char>(CurPtr[0]))) {
     // Bad token, return it as an error.
-    CurPtr = TokStart+1;
+    CurPtr = TokStart + 1;
     return lltok::Error;
   }
 
@@ -1011,25 +1077,26 @@ lltok::Kind LLLexer::Lex0x() {
 
   uint64_t Pair[2];
   switch (Kind) {
-  default: llvm_unreachable("Unknown kind!");
+  default:
+    llvm_unreachable("Unknown kind!");
   case 'K':
     // F80HexFPConstant - x87 long double in hexadecimal format (10 bytes)
-    FP80HexToIntPair(TokStart+3, CurPtr, Pair);
+    FP80HexToIntPair(TokStart + 3, CurPtr, Pair);
     APFloatVal = APFloat(APFloat::x87DoubleExtended(), APInt(80, Pair));
     return lltok::APFloat;
   case 'L':
     // F128HexFPConstant - IEEE 128-bit in hexadecimal format (16 bytes)
-    HexToIntPair(TokStart+3, CurPtr, Pair);
+    HexToIntPair(TokStart + 3, CurPtr, Pair);
     APFloatVal = APFloat(APFloat::IEEEquad(), APInt(128, Pair));
     return lltok::APFloat;
   case 'M':
     // PPC128HexFPConstant - PowerPC 128-bit in hexadecimal format (16 bytes)
-    HexToIntPair(TokStart+3, CurPtr, Pair);
+    HexToIntPair(TokStart + 3, CurPtr, Pair);
     APFloatVal = APFloat(APFloat::PPCDoubleDouble(), APInt(128, Pair));
     return lltok::APFloat;
   case 'H':
     APFloatVal = APFloat(APFloat::IEEEhalf(),
-                         APInt(16,HexIntToVal(TokStart+3, CurPtr)));
+                         APInt(16, HexIntToVal(TokStart + 3, CurPtr)));
     return lltok::APFloat;
   }
 }
@@ -1049,7 +1116,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
       !isdigit(static_cast<unsigned char>(CurPtr[0]))) {
     // Okay, this is not a number after the -, it's probably a label.
     if (const char *End = isLabelTail(CurPtr)) {
-      StrVal.assign(TokStart, End-1);
+      StrVal.assign(TokStart, End - 1);
       CurPtr = End;
       return lltok::LabelStr;
     }
@@ -1076,7 +1143,7 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
   // Check to see if this really is a string label, e.g. "-1:".
   if (isLabelChar(CurPtr[0]) || CurPtr[0] == ':') {
     if (const char *End = isLabelTail(CurPtr)) {
-      StrVal.assign(TokStart, End-1);
+      StrVal.assign(TokStart, End - 1);
       CurPtr = End;
       return lltok::LabelStr;
     }
@@ -1094,19 +1161,21 @@ lltok::Kind LLLexer::LexDigitOrNegative() {
   ++CurPtr;
 
   // Skip over [0-9]*([eE][-+]?[0-9]+)?
-  while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+  while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+    ++CurPtr;
 
   if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
     if (isdigit(static_cast<unsigned char>(CurPtr[1])) ||
         ((CurPtr[1] == '-' || CurPtr[1] == '+') &&
-          isdigit(static_cast<unsigned char>(CurPtr[2])))) {
+         isdigit(static_cast<unsigned char>(CurPtr[2])))) {
       CurPtr += 2;
-      while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+      while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+        ++CurPtr;
     }
   }
 
-  APFloatVal = APFloat(APFloat::IEEEdouble(),
-                       StringRef(TokStart, CurPtr - TokStart));
+  APFloatVal =
+      APFloat(APFloat::IEEEdouble(), StringRef(TokStart, CurPtr - TokStart));
   return lltok::APFloat;
 }
 
@@ -1124,25 +1193,27 @@ lltok::Kind LLLexer::LexPositive() {
 
   // At this point, we need a '.'.
   if (CurPtr[0] != '.') {
-    CurPtr = TokStart+1;
+    CurPtr = TokStart + 1;
     return lltok::Error;
   }
 
   ++CurPtr;
 
   // Skip over [0-9]*([eE][-+]?[0-9]+)?
-  while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+  while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+    ++CurPtr;
 
   if (CurPtr[0] == 'e' || CurPtr[0] == 'E') {
     if (isdigit(static_cast<unsigned char>(CurPtr[1])) ||
         ((CurPtr[1] == '-' || CurPtr[1] == '+') &&
-        isdigit(static_cast<unsigned char>(CurPtr[2])))) {
+         isdigit(static_cast<unsigned char>(CurPtr[2])))) {
       CurPtr += 2;
-      while (isdigit(static_cast<unsigned char>(CurPtr[0]))) ++CurPtr;
+      while (isdigit(static_cast<unsigned char>(CurPtr[0])))
+        ++CurPtr;
     }
   }
 
-  APFloatVal = APFloat(APFloat::IEEEdouble(),
-                       StringRef(TokStart, CurPtr - TokStart));
+  APFloatVal =
+      APFloat(APFloat::IEEEdouble(), StringRef(TokStart, CurPtr - TokStart));
   return lltok::APFloat;
 }
diff --git a/hpvm/llvm_patches/lib/AsmParser/LLLexer.h b/hpvm/llvm_patches/lib/AsmParser/LLLexer.h
index 4d3a2920e937475ece2c2878a7476ad30647d7c1..c37b0dbaf14a1a890b5911c53ea2f3a026f4ecc0 100644
--- a/hpvm/llvm_patches/lib/AsmParser/LLLexer.h
+++ b/hpvm/llvm_patches/lib/AsmParser/LLLexer.h
@@ -20,85 +20,81 @@
 #include <string>
 
 namespace llvm {
-  class MemoryBuffer;
-  class Type;
-  class SMDiagnostic;
-  class LLVMContext;
-
-  class LLLexer {
-    const char *CurPtr;
-    StringRef CurBuf;
-    SMDiagnostic &ErrorInfo;
-    SourceMgr &SM;
-    LLVMContext &Context;
-
-    // Information about the current token.
-    const char *TokStart;
-    lltok::Kind CurKind;
-    std::string StrVal;
-    unsigned UIntVal;
-    Type *TyVal;
-    APFloat APFloatVal;
-    APSInt  APSIntVal;
-
-    // When false (default), an identifier ending in ':' is a label token.
-    // When true, the ':' is treated as a separate token.
-    bool IgnoreColonInIdentifiers;
-
-  public:
-    explicit LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &,
-                     LLVMContext &C);
-
-    lltok::Kind Lex() {
-      return CurKind = LexToken();
-    }
-
-    typedef SMLoc LocTy;
-    LocTy getLoc() const { return SMLoc::getFromPointer(TokStart); }
-    lltok::Kind getKind() const { return CurKind; }
-    const std::string &getStrVal() const { return StrVal; }
-    Type *getTyVal() const { return TyVal; }
-    unsigned getUIntVal() const { return UIntVal; }
-    const APSInt &getAPSIntVal() const { return APSIntVal; }
-    const APFloat &getAPFloatVal() const { return APFloatVal; }
-
-    void setIgnoreColonInIdentifiers(bool val) {
-      IgnoreColonInIdentifiers = val;
-    }
-
-    bool Error(LocTy ErrorLoc, const Twine &Msg) const;
-    bool Error(const Twine &Msg) const { return Error(getLoc(), Msg); }
-
-    void Warning(LocTy WarningLoc, const Twine &Msg) const;
-    void Warning(const Twine &Msg) const { return Warning(getLoc(), Msg); }
-
-  private:
-    lltok::Kind LexToken();
-
-    int getNextChar();
-    void SkipLineComment();
-    lltok::Kind ReadString(lltok::Kind kind);
-    bool ReadVarName();
-
-    lltok::Kind LexIdentifier();
-    lltok::Kind LexDigitOrNegative();
-    lltok::Kind LexPositive();
-    lltok::Kind LexAt();
-    lltok::Kind LexDollar();
-    lltok::Kind LexExclaim();
-    lltok::Kind LexPercent();
-    lltok::Kind LexUIntID(lltok::Kind Token);
-    lltok::Kind LexVar(lltok::Kind Var, lltok::Kind VarID);
-    lltok::Kind LexQuote();
-    lltok::Kind Lex0x();
-    lltok::Kind LexHash();
-    lltok::Kind LexCaret();
-
-    uint64_t atoull(const char *Buffer, const char *End);
-    uint64_t HexIntToVal(const char *Buffer, const char *End);
-    void HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]);
-    void FP80HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]);
-  };
+class MemoryBuffer;
+class Type;
+class SMDiagnostic;
+class LLVMContext;
+
+class LLLexer {
+  const char *CurPtr;
+  StringRef CurBuf;
+  SMDiagnostic &ErrorInfo;
+  SourceMgr &SM;
+  LLVMContext &Context;
+
+  // Information about the current token.
+  const char *TokStart;
+  lltok::Kind CurKind;
+  std::string StrVal;
+  unsigned UIntVal;
+  Type *TyVal;
+  APFloat APFloatVal;
+  APSInt APSIntVal;
+
+  // When false (default), an identifier ending in ':' is a label token.
+  // When true, the ':' is treated as a separate token.
+  bool IgnoreColonInIdentifiers;
+
+public:
+  explicit LLLexer(StringRef StartBuf, SourceMgr &SM, SMDiagnostic &,
+                   LLVMContext &C);
+
+  lltok::Kind Lex() { return CurKind = LexToken(); }
+
+  typedef SMLoc LocTy;
+  LocTy getLoc() const { return SMLoc::getFromPointer(TokStart); }
+  lltok::Kind getKind() const { return CurKind; }
+  const std::string &getStrVal() const { return StrVal; }
+  Type *getTyVal() const { return TyVal; }
+  unsigned getUIntVal() const { return UIntVal; }
+  const APSInt &getAPSIntVal() const { return APSIntVal; }
+  const APFloat &getAPFloatVal() const { return APFloatVal; }
+
+  void setIgnoreColonInIdentifiers(bool val) { IgnoreColonInIdentifiers = val; }
+
+  bool Error(LocTy ErrorLoc, const Twine &Msg) const;
+  bool Error(const Twine &Msg) const { return Error(getLoc(), Msg); }
+
+  void Warning(LocTy WarningLoc, const Twine &Msg) const;
+  void Warning(const Twine &Msg) const { return Warning(getLoc(), Msg); }
+
+private:
+  lltok::Kind LexToken();
+
+  int getNextChar();
+  void SkipLineComment();
+  lltok::Kind ReadString(lltok::Kind kind);
+  bool ReadVarName();
+
+  lltok::Kind LexIdentifier();
+  lltok::Kind LexDigitOrNegative();
+  lltok::Kind LexPositive();
+  lltok::Kind LexAt();
+  lltok::Kind LexDollar();
+  lltok::Kind LexExclaim();
+  lltok::Kind LexPercent();
+  lltok::Kind LexUIntID(lltok::Kind Token);
+  lltok::Kind LexVar(lltok::Kind Var, lltok::Kind VarID);
+  lltok::Kind LexQuote();
+  lltok::Kind Lex0x();
+  lltok::Kind LexHash();
+  lltok::Kind LexCaret();
+
+  uint64_t atoull(const char *Buffer, const char *End);
+  uint64_t HexIntToVal(const char *Buffer, const char *End);
+  void HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]);
+  void FP80HexToIntPair(const char *Buffer, const char *End, uint64_t Pair[2]);
+};
 } // end namespace llvm
 
 #endif
diff --git a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp
index bee1fc89014a43dfa3e13925def21e8be1aad58c..f5ce44e2a920405f7e3790fcb1d9eb7fba28d636 100644
--- a/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp
+++ b/hpvm/llvm_patches/lib/AsmParser/LLParser.cpp
@@ -174,7 +174,7 @@ bool LLParser::ValidateEndOfModule() {
     } else if (auto *GV = dyn_cast<GlobalVariable>(V)) {
       AttrBuilder Attrs(GV->getAttributes());
       Attrs.merge(B);
-      GV->setAttributes(AttributeSet::get(Context,Attrs));
+      GV->setAttributes(AttributeSet::get(Context, Attrs));
     } else {
       llvm_unreachable("invalid object with forward attribute group reference");
     }
@@ -191,8 +191,9 @@ bool LLParser::ValidateEndOfModule() {
       return Error(NT.second.second,
                    "use of undefined type '%" + Twine(NT.first) + "'");
 
-  for (StringMap<std::pair<Type*, LocTy> >::iterator I =
-       NamedTypes.begin(), E = NamedTypes.end(); I != E; ++I)
+  for (StringMap<std::pair<Type *, LocTy>>::iterator I = NamedTypes.begin(),
+                                                     E = NamedTypes.end();
+       I != E; ++I)
     if (I->second.second.isValid())
       return Error(I->second.second,
                    "use of undefined type named '" + I->getKey() + "'");
@@ -205,17 +206,17 @@ bool LLParser::ValidateEndOfModule() {
   if (!ForwardRefVals.empty())
     return Error(ForwardRefVals.begin()->second.second,
                  "use of undefined value '@" + ForwardRefVals.begin()->first +
-                 "'");
+                     "'");
 
   if (!ForwardRefValIDs.empty())
     return Error(ForwardRefValIDs.begin()->second.second,
                  "use of undefined value '@" +
-                 Twine(ForwardRefValIDs.begin()->first) + "'");
+                     Twine(ForwardRefValIDs.begin()->first) + "'");
 
   if (!ForwardRefMDNodes.empty())
     return Error(ForwardRefMDNodes.begin()->second.second,
                  "use of undefined metadata '!" +
-                 Twine(ForwardRefMDNodes.begin()->first) + "'");
+                     Twine(ForwardRefMDNodes.begin()->first) + "'");
 
   // Resolve metadata cycles.
   for (auto &N : NumberedMetadata) {
@@ -232,13 +233,13 @@ bool LLParser::ValidateEndOfModule() {
   }
 
   // Look for intrinsic functions and CallInst that need to be upgraded
-  for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; )
+  for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE;)
     UpgradeCallsToIntrinsic(&*FI++); // must be post-increment, as we remove
 
   // Some types could be renamed during loading if several modules are
   // loaded in the same LLVMContext (LTO scenario). In this case we should
   // remangle intrinsics names as well.
-  for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ) {
+  for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE;) {
     Function *F = &*FI++;
     if (auto Remangled = Intrinsic::remangleIntrinsicFunction(F)) {
       F->replaceAllUsesWith(Remangled.getValue());
@@ -317,30 +318,74 @@ bool LLParser::ParseTopLevelEntities() {
   }
   while (true) {
     switch (Lex.getKind()) {
-    default:         return TokError("expected top-level entity");
-    case lltok::Eof: return false;
-    case lltok::kw_declare: if (ParseDeclare()) return true; break;
-    case lltok::kw_define:  if (ParseDefine()) return true; break;
-    case lltok::kw_module:  if (ParseModuleAsm()) return true; break;
-    case lltok::kw_target:  if (ParseTargetDefinition()) return true; break;
+    default:
+      return TokError("expected top-level entity");
+    case lltok::Eof:
+      return false;
+    case lltok::kw_declare:
+      if (ParseDeclare())
+        return true;
+      break;
+    case lltok::kw_define:
+      if (ParseDefine())
+        return true;
+      break;
+    case lltok::kw_module:
+      if (ParseModuleAsm())
+        return true;
+      break;
+    case lltok::kw_target:
+      if (ParseTargetDefinition())
+        return true;
+      break;
     case lltok::kw_source_filename:
       if (ParseSourceFileName())
         return true;
       break;
-    case lltok::kw_deplibs: if (ParseDepLibs()) return true; break;
-    case lltok::LocalVarID: if (ParseUnnamedType()) return true; break;
-    case lltok::LocalVar:   if (ParseNamedType()) return true; break;
-    case lltok::GlobalID:   if (ParseUnnamedGlobal()) return true; break;
-    case lltok::GlobalVar:  if (ParseNamedGlobal()) return true; break;
-    case lltok::ComdatVar:  if (parseComdat()) return true; break;
-    case lltok::exclaim:    if (ParseStandaloneMetadata()) return true; break;
+    case lltok::kw_deplibs:
+      if (ParseDepLibs())
+        return true;
+      break;
+    case lltok::LocalVarID:
+      if (ParseUnnamedType())
+        return true;
+      break;
+    case lltok::LocalVar:
+      if (ParseNamedType())
+        return true;
+      break;
+    case lltok::GlobalID:
+      if (ParseUnnamedGlobal())
+        return true;
+      break;
+    case lltok::GlobalVar:
+      if (ParseNamedGlobal())
+        return true;
+      break;
+    case lltok::ComdatVar:
+      if (parseComdat())
+        return true;
+      break;
+    case lltok::exclaim:
+      if (ParseStandaloneMetadata())
+        return true;
+      break;
     case lltok::SummaryID:
       if (ParseSummaryEntry())
         return true;
       break;
-    case lltok::MetadataVar:if (ParseNamedMetadata()) return true; break;
-    case lltok::kw_attributes: if (ParseUnnamedAttrGrp()) return true; break;
-    case lltok::kw_uselistorder: if (ParseUseListOrder()) return true; break;
+    case lltok::MetadataVar:
+      if (ParseNamedMetadata())
+        return true;
+      break;
+    case lltok::kw_attributes:
+      if (ParseUnnamedAttrGrp())
+        return true;
+      break;
+    case lltok::kw_uselistorder:
+      if (ParseUseListOrder())
+        return true;
+      break;
     case lltok::kw_uselistorder_bb:
       if (ParseUseListOrderBB())
         return true;
@@ -357,7 +402,8 @@ bool LLParser::ParseModuleAsm() {
 
   std::string AsmStr;
   if (ParseToken(lltok::kw_asm, "expected 'module asm'") ||
-      ParseStringConstant(AsmStr)) return true;
+      ParseStringConstant(AsmStr))
+    return true;
 
   M->appendModuleInlineAsm(AsmStr);
   return false;
@@ -370,7 +416,8 @@ bool LLParser::ParseTargetDefinition() {
   assert(Lex.getKind() == lltok::kw_target);
   std::string Str;
   switch (Lex.Lex()) {
-  default: return TokError("unknown target property");
+  default:
+    return TokError("unknown target property");
   case lltok::kw_triple:
     Lex.Lex();
     if (ParseToken(lltok::equal, "expected '=' after target triple") ||
@@ -418,7 +465,8 @@ bool LLParser::ParseDepLibs() {
 
   do {
     std::string Str;
-    if (ParseStringConstant(Str)) return true;
+    if (ParseStringConstant(Str))
+      return true;
   } while (EatIfPresent(lltok::comma));
 
   return ParseToken(lltok::rsquare, "expected ']' at end of list");
@@ -436,11 +484,11 @@ bool LLParser::ParseUnnamedType() {
     return true;
 
   Type *Result = nullptr;
-  if (ParseStructDefinition(TypeLoc, "",
-                            NumberedTypes[TypeID], Result)) return true;
+  if (ParseStructDefinition(TypeLoc, "", NumberedTypes[TypeID], Result))
+    return true;
 
   if (!isa<StructType>(Result)) {
-    std::pair<Type*, LocTy> &Entry = NumberedTypes[TypeID];
+    std::pair<Type *, LocTy> &Entry = NumberedTypes[TypeID];
     if (Entry.first)
       return Error(TypeLoc, "non-struct types may not be recursive");
     Entry.first = Result;
@@ -455,18 +503,18 @@ bool LLParser::ParseUnnamedType() {
 bool LLParser::ParseNamedType() {
   std::string Name = Lex.getStrVal();
   LocTy NameLoc = Lex.getLoc();
-  Lex.Lex();  // eat LocalVar.
+  Lex.Lex(); // eat LocalVar.
 
   if (ParseToken(lltok::equal, "expected '=' after name") ||
       ParseToken(lltok::kw_type, "expected 'type' after name"))
     return true;
 
   Type *Result = nullptr;
-  if (ParseStructDefinition(NameLoc, Name,
-                            NamedTypes[Name], Result)) return true;
+  if (ParseStructDefinition(NameLoc, Name, NamedTypes[Name], Result))
+    return true;
 
   if (!isa<StructType>(Result)) {
-    std::pair<Type*, LocTy> &Entry = NamedTypes[Name];
+    std::pair<Type *, LocTy> &Entry = NamedTypes[Name];
     if (Entry.first)
       return Error(NameLoc, "non-struct types may not be recursive");
     Entry.first = Result;
@@ -506,8 +554,7 @@ bool LLParser::ParseDefine() {
   Lex.Lex();
 
   Function *F;
-  return ParseFunctionHeader(F, true) ||
-         ParseOptionalFunctionMetadata(*F) ||
+  return ParseFunctionHeader(F, true) || ParseOptionalFunctionMetadata(*F) ||
          ParseFunctionBody(*F);
 }
 
@@ -544,7 +591,8 @@ bool LLParser::ParseOptionalUnnamedAddr(
 ///   OptionalDLLStorageClass
 ///                                                     ...   -> global variable
 ///   GlobalID '=' OptionalVisibility (ALIAS | IFUNC) ...
-///   GlobalID '=' OptionalLinkage OptionalPreemptionSpecifier OptionalVisibility
+///   GlobalID '=' OptionalLinkage OptionalPreemptionSpecifier
+///   OptionalVisibility
 ///                OptionalDLLStorageClass
 ///                                                     ...   -> global variable
 bool LLParser::ParseUnnamedGlobal() {
@@ -555,8 +603,8 @@ bool LLParser::ParseUnnamedGlobal() {
   // Handle the GlobalID form.
   if (Lex.getKind() == lltok::GlobalID) {
     if (Lex.getUIntVal() != VarID)
-      return Error(Lex.getLoc(), "variable expected to be numbered '%" +
-                   Twine(VarID) + "'");
+      return Error(Lex.getLoc(),
+                   "variable expected to be numbered '%" + Twine(VarID) + "'");
     Lex.Lex(); // eat GlobalID;
 
     if (ParseToken(lltok::equal, "expected '=' after name"))
@@ -665,7 +713,8 @@ bool LLParser::parseComdat() {
 //   ::= '!' STRINGCONSTANT
 bool LLParser::ParseMDString(MDString *&Result) {
   std::string Str;
-  if (ParseStringConstant(Str)) return true;
+  if (ParseStringConstant(Str))
+    return true;
   Result = MDString::get(Context, Str);
   return false;
 }
@@ -735,8 +784,7 @@ bool LLParser::ParseStandaloneMetadata() {
   unsigned MetadataID = 0;
 
   MDNode *Init;
-  if (ParseUInt32(MetadataID) ||
-      ParseToken(lltok::equal, "expected '=' here"))
+  if (ParseUInt32(MetadataID) || ParseToken(lltok::equal, "expected '=' here"))
     return true;
 
   // Detect common error, from old metadata syntax.
@@ -883,9 +931,9 @@ bool LLParser::parseIndirectSymbol(const std::string &Name, LocTy NameLoc,
     llvm_unreachable("Not an alias or ifunc!");
   Lex.Lex();
 
-  GlobalValue::LinkageTypes Linkage = (GlobalValue::LinkageTypes) L;
+  GlobalValue::LinkageTypes Linkage = (GlobalValue::LinkageTypes)L;
 
-  if(IsAlias && !GlobalAlias::isValidLinkage(Linkage))
+  if (IsAlias && !GlobalAlias::isValidLinkage(Linkage))
     return Error(NameLoc, "invalid linkage type for alias");
 
   if (!isValidVisibilityForLinkage(Visibility, L))
@@ -923,14 +971,12 @@ bool LLParser::parseIndirectSymbol(const std::string &Name, LocTy NameLoc,
   unsigned AddrSpace = PTy->getAddressSpace();
 
   if (IsAlias && Ty != PTy->getElementType())
-    return Error(
-        ExplicitTypeLoc,
-        "explicit pointee type doesn't match operand's pointee type");
+    return Error(ExplicitTypeLoc,
+                 "explicit pointee type doesn't match operand's pointee type");
 
   if (!IsAlias && !PTy->getElementType()->isFunctionTy())
-    return Error(
-        ExplicitTypeLoc,
-        "explicit pointee type should be a function type");
+    return Error(ExplicitTypeLoc,
+                 "explicit pointee type should be a function type");
 
   GlobalValue *GVal = nullptr;
 
@@ -1042,16 +1088,14 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
       ParseOptionalToken(lltok::kw_externally_initialized,
                          IsExternallyInitialized,
                          &IsExternallyInitializedLoc) ||
-      ParseGlobalType(IsConstant) ||
-      ParseType(Ty, TyLoc))
+      ParseGlobalType(IsConstant) || ParseType(Ty, TyLoc))
     return true;
 
   // If the linkage is specified and is external, then no initializer is
   // present.
   Constant *Init = nullptr;
-  if (!HasLinkage ||
-      !GlobalValue::isValidDeclarationLinkage(
-          (GlobalValue::LinkageTypes)Linkage)) {
+  if (!HasLinkage || !GlobalValue::isValidDeclarationLinkage(
+                         (GlobalValue::LinkageTypes)Linkage)) {
     if (ParseGlobalValue(Ty, Init))
       return true;
   }
@@ -1078,13 +1122,14 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
 
   GlobalVariable *GV;
   if (!GVal) {
-    GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, nullptr,
-                            Name, nullptr, GlobalVariable::NotThreadLocal,
-                            AddrSpace);
+    GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage,
+                            nullptr, Name, nullptr,
+                            GlobalVariable::NotThreadLocal, AddrSpace);
   } else {
     if (GVal->getValueType() != Ty)
-      return Error(TyLoc,
-            "forward reference and definition of global have different types");
+      return Error(
+          TyLoc,
+          "forward reference and definition of global have different types");
 
     GV = cast<GlobalVariable>(GVal);
 
@@ -1123,7 +1168,8 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc,
         return true;
     } else if (Lex.getKind() == lltok::kw_align) {
       unsigned Alignment;
-      if (ParseOptionalAlignment(Alignment)) return true;
+      if (ParseOptionalAlignment(Alignment))
+        return true;
       GV->setAlignment(Alignment);
     } else if (Lex.getKind() == lltok::MetadataVar) {
       if (ParseGlobalObjectMetadataAttachment(*GV))
@@ -1195,7 +1241,8 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
       BuiltinLoc = Lex.getLoc();
     switch (Token) {
     default:
-      if (!inAttrGrp) return HaveError;
+      if (!inAttrGrp)
+        return HaveError;
       return Error(Lex.getLoc(), "unterminated attribute group");
     case lltok::rbrace:
       // Finished.
@@ -1206,12 +1253,13 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
       //
       //   define void @foo() #1 { ... }
       if (inAttrGrp)
-        HaveError |=
-          Error(Lex.getLoc(),
-              "cannot have an attribute group reference in an attribute group");
+        HaveError |= Error(
+            Lex.getLoc(),
+            "cannot have an attribute group reference in an attribute group");
 
       unsigned AttrGrpNum = Lex.getUIntVal();
-      if (inAttrGrp) break;
+      if (inAttrGrp)
+        break;
 
       // Save the reference to the attribute group. We'll fill it in later.
       FwdRefAttrGrps.push_back(AttrGrpNum);
@@ -1265,73 +1313,148 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
       B.addAllocSizeAttr(ElemSizeArg, NumElemsArg);
       continue;
     }
-    case lltok::kw_alwaysinline: B.addAttribute(Attribute::AlwaysInline); break;
-    case lltok::kw_argmemonly: B.addAttribute(Attribute::ArgMemOnly); break;
-    case lltok::kw_builtin: B.addAttribute(Attribute::Builtin); break;
-    case lltok::kw_cold: B.addAttribute(Attribute::Cold); break;
-    case lltok::kw_convergent: B.addAttribute(Attribute::Convergent); break;
+    case lltok::kw_alwaysinline:
+      B.addAttribute(Attribute::AlwaysInline);
+      break;
+    case lltok::kw_argmemonly:
+      B.addAttribute(Attribute::ArgMemOnly);
+      break;
+    case lltok::kw_builtin:
+      B.addAttribute(Attribute::Builtin);
+      break;
+    case lltok::kw_cold:
+      B.addAttribute(Attribute::Cold);
+      break;
+    case lltok::kw_convergent:
+      B.addAttribute(Attribute::Convergent);
+      break;
     case lltok::kw_inaccessiblememonly:
-      B.addAttribute(Attribute::InaccessibleMemOnly); break;
+      B.addAttribute(Attribute::InaccessibleMemOnly);
+      break;
     case lltok::kw_inaccessiblemem_or_argmemonly:
-      B.addAttribute(Attribute::InaccessibleMemOrArgMemOnly); break;
-    case lltok::kw_inlinehint: B.addAttribute(Attribute::InlineHint); break;
-    case lltok::kw_jumptable: B.addAttribute(Attribute::JumpTable); break;
-    case lltok::kw_minsize: B.addAttribute(Attribute::MinSize); break;
-    case lltok::kw_naked: B.addAttribute(Attribute::Naked); break;
-    case lltok::kw_nobuiltin: B.addAttribute(Attribute::NoBuiltin); break;
-    case lltok::kw_noduplicate: B.addAttribute(Attribute::NoDuplicate); break;
-    case lltok::kw_nofree: B.addAttribute(Attribute::NoFree); break;
+      B.addAttribute(Attribute::InaccessibleMemOrArgMemOnly);
+      break;
+    case lltok::kw_inlinehint:
+      B.addAttribute(Attribute::InlineHint);
+      break;
+    case lltok::kw_jumptable:
+      B.addAttribute(Attribute::JumpTable);
+      break;
+    case lltok::kw_minsize:
+      B.addAttribute(Attribute::MinSize);
+      break;
+    case lltok::kw_naked:
+      B.addAttribute(Attribute::Naked);
+      break;
+    case lltok::kw_nobuiltin:
+      B.addAttribute(Attribute::NoBuiltin);
+      break;
+    case lltok::kw_noduplicate:
+      B.addAttribute(Attribute::NoDuplicate);
+      break;
+    case lltok::kw_nofree:
+      B.addAttribute(Attribute::NoFree);
+      break;
     case lltok::kw_noimplicitfloat:
-      B.addAttribute(Attribute::NoImplicitFloat); break;
-    case lltok::kw_noinline: B.addAttribute(Attribute::NoInline); break;
-    case lltok::kw_nonlazybind: B.addAttribute(Attribute::NonLazyBind); break;
-    case lltok::kw_noredzone: B.addAttribute(Attribute::NoRedZone); break;
-    case lltok::kw_noreturn: B.addAttribute(Attribute::NoReturn); break;
-    case lltok::kw_nosync: B.addAttribute(Attribute::NoSync); break;
-    case lltok::kw_nocf_check: B.addAttribute(Attribute::NoCfCheck); break;
-    case lltok::kw_norecurse: B.addAttribute(Attribute::NoRecurse); break;
-    case lltok::kw_nounwind: B.addAttribute(Attribute::NoUnwind); break;
+      B.addAttribute(Attribute::NoImplicitFloat);
+      break;
+    case lltok::kw_noinline:
+      B.addAttribute(Attribute::NoInline);
+      break;
+    case lltok::kw_nonlazybind:
+      B.addAttribute(Attribute::NonLazyBind);
+      break;
+    case lltok::kw_noredzone:
+      B.addAttribute(Attribute::NoRedZone);
+      break;
+    case lltok::kw_noreturn:
+      B.addAttribute(Attribute::NoReturn);
+      break;
+    case lltok::kw_nosync:
+      B.addAttribute(Attribute::NoSync);
+      break;
+    case lltok::kw_nocf_check:
+      B.addAttribute(Attribute::NoCfCheck);
+      break;
+    case lltok::kw_norecurse:
+      B.addAttribute(Attribute::NoRecurse);
+      break;
+    case lltok::kw_nounwind:
+      B.addAttribute(Attribute::NoUnwind);
+      break;
     case lltok::kw_optforfuzzing:
-      B.addAttribute(Attribute::OptForFuzzing); break;
-    case lltok::kw_optnone: B.addAttribute(Attribute::OptimizeNone); break;
-    case lltok::kw_optsize: B.addAttribute(Attribute::OptimizeForSize); break;
-    case lltok::kw_readnone: B.addAttribute(Attribute::ReadNone); break;
-    case lltok::kw_readonly: B.addAttribute(Attribute::ReadOnly); break;
+      B.addAttribute(Attribute::OptForFuzzing);
+      break;
+    case lltok::kw_optnone:
+      B.addAttribute(Attribute::OptimizeNone);
+      break;
+    case lltok::kw_optsize:
+      B.addAttribute(Attribute::OptimizeForSize);
+      break;
+    case lltok::kw_readnone:
+      B.addAttribute(Attribute::ReadNone);
+      break;
+    case lltok::kw_readonly:
+      B.addAttribute(Attribute::ReadOnly);
+      break;
     case lltok::kw_returns_twice:
-      B.addAttribute(Attribute::ReturnsTwice); break;
-    case lltok::kw_speculatable: B.addAttribute(Attribute::Speculatable); break;
-    case lltok::kw_ssp: B.addAttribute(Attribute::StackProtect); break;
-    case lltok::kw_sspreq: B.addAttribute(Attribute::StackProtectReq); break;
+      B.addAttribute(Attribute::ReturnsTwice);
+      break;
+    case lltok::kw_speculatable:
+      B.addAttribute(Attribute::Speculatable);
+      break;
+    case lltok::kw_ssp:
+      B.addAttribute(Attribute::StackProtect);
+      break;
+    case lltok::kw_sspreq:
+      B.addAttribute(Attribute::StackProtectReq);
+      break;
     case lltok::kw_sspstrong:
-      B.addAttribute(Attribute::StackProtectStrong); break;
-    case lltok::kw_safestack: B.addAttribute(Attribute::SafeStack); break;
+      B.addAttribute(Attribute::StackProtectStrong);
+      break;
+    case lltok::kw_safestack:
+      B.addAttribute(Attribute::SafeStack);
+      break;
     case lltok::kw_shadowcallstack:
-      B.addAttribute(Attribute::ShadowCallStack); break;
+      B.addAttribute(Attribute::ShadowCallStack);
+      break;
     case lltok::kw_sanitize_address:
-      B.addAttribute(Attribute::SanitizeAddress); break;
+      B.addAttribute(Attribute::SanitizeAddress);
+      break;
     case lltok::kw_sanitize_hwaddress:
-      B.addAttribute(Attribute::SanitizeHWAddress); break;
+      B.addAttribute(Attribute::SanitizeHWAddress);
+      break;
     case lltok::kw_sanitize_memtag:
-      B.addAttribute(Attribute::SanitizeMemTag); break;
+      B.addAttribute(Attribute::SanitizeMemTag);
+      break;
     case lltok::kw_sanitize_thread:
-      B.addAttribute(Attribute::SanitizeThread); break;
+      B.addAttribute(Attribute::SanitizeThread);
+      break;
     case lltok::kw_sanitize_memory:
-      B.addAttribute(Attribute::SanitizeMemory); break;
+      B.addAttribute(Attribute::SanitizeMemory);
+      break;
     case lltok::kw_speculative_load_hardening:
       B.addAttribute(Attribute::SpeculativeLoadHardening);
       break;
-    case lltok::kw_strictfp: B.addAttribute(Attribute::StrictFP); break;
-    case lltok::kw_uwtable: B.addAttribute(Attribute::UWTable); break;
-    case lltok::kw_willreturn: B.addAttribute(Attribute::WillReturn); break;
-    case lltok::kw_writeonly: B.addAttribute(Attribute::WriteOnly); break;
+    case lltok::kw_strictfp:
+      B.addAttribute(Attribute::StrictFP);
+      break;
+    case lltok::kw_uwtable:
+      B.addAttribute(Attribute::UWTable);
+      break;
+    case lltok::kw_willreturn:
+      B.addAttribute(Attribute::WillReturn);
+      break;
+    case lltok::kw_writeonly:
+      B.addAttribute(Attribute::WriteOnly);
+      break;
 
     // Error handling.
     case lltok::kw_inreg:
     case lltok::kw_signext:
     case lltok::kw_zeroext:
       HaveError |=
-        Error(Lex.getLoc(),
-              "invalid use of attribute on a function");
+          Error(Lex.getLoc(), "invalid use of attribute on a function");
       break;
     case lltok::kw_byval:
     case lltok::kw_dereferenceable:
@@ -1346,14 +1469,14 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
     case lltok::kw_swifterror:
     case lltok::kw_swiftself:
     case lltok::kw_immarg:
-    
+
     // VISC Parameter only attributes
     case lltok::kw_in:
     case lltok::kw_out:
     case lltok::kw_inout:
       HaveError |=
-        Error(Lex.getLoc(),
-              "invalid use of parameter-only attribute on a function");
+          Error(Lex.getLoc(),
+                "invalid use of parameter-only attribute on a function");
       break;
     }
 
@@ -1412,7 +1535,7 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty,
 
   // Look this name up in the normal function symbol table.
   GlobalValue *Val =
-    cast_or_null<GlobalValue>(M->getValueSymbolTable().lookup(Name));
+      cast_or_null<GlobalValue>(M->getValueSymbolTable().lookup(Name));
 
   // If this is a forward reference for the value, see if we already created a
   // forward ref record.
@@ -1507,7 +1630,7 @@ bool LLParser::ParseStringConstant(std::string &Result) {
 bool LLParser::ParseUInt32(uint32_t &Val) {
   if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned())
     return TokError("expected integer");
-  uint64_t Val64 = Lex.getAPSIntVal().getLimitedValue(0xFFFFFFFFULL+1);
+  uint64_t Val64 = Lex.getAPSIntVal().getLimitedValue(0xFFFFFFFFULL + 1);
   if (Val64 != unsigned(Val64))
     return TokError("expected 32-bit integer (too large)");
   Val = Val64;
@@ -1531,17 +1654,17 @@ bool LLParser::ParseUInt64(uint64_t &Val) {
 ///   := 'localexec'
 bool LLParser::ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM) {
   switch (Lex.getKind()) {
-    default:
-      return TokError("expected localdynamic, initialexec or localexec");
-    case lltok::kw_localdynamic:
-      TLM = GlobalVariable::LocalDynamicTLSModel;
-      break;
-    case lltok::kw_initialexec:
-      TLM = GlobalVariable::InitialExecTLSModel;
-      break;
-    case lltok::kw_localexec:
-      TLM = GlobalVariable::LocalExecTLSModel;
-      break;
+  default:
+    return TokError("expected localdynamic, initialexec or localexec");
+  case lltok::kw_localdynamic:
+    TLM = GlobalVariable::LocalDynamicTLSModel;
+    break;
+  case lltok::kw_initialexec:
+    TLM = GlobalVariable::InitialExecTLSModel;
+    break;
+  case lltok::kw_localexec:
+    TLM = GlobalVariable::LocalExecTLSModel;
+    break;
   }
 
   Lex.Lex();
@@ -1561,7 +1684,7 @@ bool LLParser::ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM) {
   if (Lex.getKind() == lltok::lparen) {
     Lex.Lex();
     return ParseTLSModel(TLM) ||
-      ParseToken(lltok::rparen, "expected ')' after thread local model");
+           ParseToken(lltok::rparen, "expected ')' after thread local model");
   }
   return false;
 }
@@ -1591,7 +1714,8 @@ bool LLParser::ParseStringAttribute(AttrBuilder &B) {
   return false;
 }
 
-/// ParseOptionalParamAttrs - Parse a potentially empty list of parameter attributes.
+/// ParseOptionalParamAttrs - Parse a potentially empty list of parameter
+/// attributes.
 bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
   bool HaveError = false;
 
@@ -1600,7 +1724,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
   while (true) {
     lltok::Kind Token = Lex.getKind();
     switch (Token) {
-    default:  // End of attributes.
+    default: // End of attributes.
       return HaveError;
     case lltok::StringConstant: {
       if (ParseStringAttribute(B))
@@ -1635,27 +1759,65 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
       B.addDereferenceableOrNullAttr(Bytes);
       continue;
     }
-    case lltok::kw_inalloca:        B.addAttribute(Attribute::InAlloca); break;
-    case lltok::kw_inreg:           B.addAttribute(Attribute::InReg); break;
-    case lltok::kw_nest:            B.addAttribute(Attribute::Nest); break;
-    case lltok::kw_noalias:         B.addAttribute(Attribute::NoAlias); break;
-    case lltok::kw_nocapture:       B.addAttribute(Attribute::NoCapture); break;
-    case lltok::kw_nonnull:         B.addAttribute(Attribute::NonNull); break;
-    case lltok::kw_readnone:        B.addAttribute(Attribute::ReadNone); break;
-    case lltok::kw_readonly:        B.addAttribute(Attribute::ReadOnly); break;
-    case lltok::kw_returned:        B.addAttribute(Attribute::Returned); break;
-    case lltok::kw_signext:         B.addAttribute(Attribute::SExt); break;
-    case lltok::kw_sret:            B.addAttribute(Attribute::StructRet); break;
-    case lltok::kw_swifterror:      B.addAttribute(Attribute::SwiftError); break;
-    case lltok::kw_swiftself:       B.addAttribute(Attribute::SwiftSelf); break;
-    case lltok::kw_writeonly:       B.addAttribute(Attribute::WriteOnly); break;
-    case lltok::kw_zeroext:         B.addAttribute(Attribute::ZExt); break;
-    case lltok::kw_immarg:          B.addAttribute(Attribute::ImmArg); break;
+    case lltok::kw_inalloca:
+      B.addAttribute(Attribute::InAlloca);
+      break;
+    case lltok::kw_inreg:
+      B.addAttribute(Attribute::InReg);
+      break;
+    case lltok::kw_nest:
+      B.addAttribute(Attribute::Nest);
+      break;
+    case lltok::kw_noalias:
+      B.addAttribute(Attribute::NoAlias);
+      break;
+    case lltok::kw_nocapture:
+      B.addAttribute(Attribute::NoCapture);
+      break;
+    case lltok::kw_nonnull:
+      B.addAttribute(Attribute::NonNull);
+      break;
+    case lltok::kw_readnone:
+      B.addAttribute(Attribute::ReadNone);
+      break;
+    case lltok::kw_readonly:
+      B.addAttribute(Attribute::ReadOnly);
+      break;
+    case lltok::kw_returned:
+      B.addAttribute(Attribute::Returned);
+      break;
+    case lltok::kw_signext:
+      B.addAttribute(Attribute::SExt);
+      break;
+    case lltok::kw_sret:
+      B.addAttribute(Attribute::StructRet);
+      break;
+    case lltok::kw_swifterror:
+      B.addAttribute(Attribute::SwiftError);
+      break;
+    case lltok::kw_swiftself:
+      B.addAttribute(Attribute::SwiftSelf);
+      break;
+    case lltok::kw_writeonly:
+      B.addAttribute(Attribute::WriteOnly);
+      break;
+    case lltok::kw_zeroext:
+      B.addAttribute(Attribute::ZExt);
+      break;
+    case lltok::kw_immarg:
+      B.addAttribute(Attribute::ImmArg);
+      break;
 
     // VISC parameter attributes
-    case lltok::kw_in:              B.addAttribute(Attribute::In); break;
-    case lltok::kw_out:             B.addAttribute(Attribute::Out); break;
-    case lltok::kw_inout:           B.addAttribute(Attribute::InOut); break;
+    case lltok::kw_in:
+      B.addAttribute(Attribute::In);
+      break;
+    case lltok::kw_out:
+      B.addAttribute(Attribute::Out);
+      break;
+    case lltok::kw_inout:
+      B.addAttribute(Attribute::InOut);
+      break;
 
     case lltok::kw_alignstack:
     case lltok::kw_alwaysinline:
@@ -1691,7 +1853,8 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
     case lltok::kw_shadowcallstack:
     case lltok::kw_strictfp:
     case lltok::kw_uwtable:
-      HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute");
+      HaveError |=
+          Error(Lex.getLoc(), "invalid use of function-only attribute");
       break;
     }
 
@@ -1699,7 +1862,8 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
   }
 }
 
-/// ParseOptionalReturnAttrs - Parse a potentially empty list of return attributes.
+/// ParseOptionalReturnAttrs - Parse a potentially empty list of return
+/// attributes.
 bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
   bool HaveError = false;
 
@@ -1708,7 +1872,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
   while (true) {
     lltok::Kind Token = Lex.getKind();
     switch (Token) {
-    default:  // End of attributes.
+    default: // End of attributes.
       return HaveError;
     case lltok::StringConstant: {
       if (ParseStringAttribute(B))
@@ -1736,11 +1900,21 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
       B.addAlignmentAttr(Alignment);
       continue;
     }
-    case lltok::kw_inreg:           B.addAttribute(Attribute::InReg); break;
-    case lltok::kw_noalias:         B.addAttribute(Attribute::NoAlias); break;
-    case lltok::kw_nonnull:         B.addAttribute(Attribute::NonNull); break;
-    case lltok::kw_signext:         B.addAttribute(Attribute::SExt); break;
-    case lltok::kw_zeroext:         B.addAttribute(Attribute::ZExt); break;
+    case lltok::kw_inreg:
+      B.addAttribute(Attribute::InReg);
+      break;
+    case lltok::kw_noalias:
+      B.addAttribute(Attribute::NoAlias);
+      break;
+    case lltok::kw_nonnull:
+      B.addAttribute(Attribute::NonNull);
+      break;
+    case lltok::kw_signext:
+      B.addAttribute(Attribute::SExt);
+      break;
+    case lltok::kw_zeroext:
+      B.addAttribute(Attribute::ZExt);
+      break;
 
     // Error handling.
     case lltok::kw_byval:
@@ -1757,7 +1931,8 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
     case lltok::kw_in:
     case lltok::kw_out:
     case lltok::kw_inout:
-      HaveError |= Error(Lex.getLoc(), "invalid use of parameter-only attribute");
+      HaveError |=
+          Error(Lex.getLoc(), "invalid use of parameter-only attribute");
       break;
 
     case lltok::kw_alignstack:
@@ -1795,12 +1970,14 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
     case lltok::kw_shadowcallstack:
     case lltok::kw_strictfp:
     case lltok::kw_uwtable:
-      HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute");
+      HaveError |=
+          Error(Lex.getLoc(), "invalid use of function-only attribute");
       break;
 
     case lltok::kw_readnone:
     case lltok::kw_readonly:
-      HaveError |= Error(Lex.getLoc(), "invalid use of attribute on return type");
+      HaveError |=
+          Error(Lex.getLoc(), "invalid use of attribute on return type");
     }
 
     Lex.Lex();
@@ -1853,8 +2030,7 @@ static unsigned parseOptionalLinkageAux(lltok::Kind Kind, bool &HasLinkage) {
 ///   ::= 'external'
 bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage,
                                     unsigned &Visibility,
-                                    unsigned &DLLStorageClass,
-                                    bool &DSOLocal) {
+                                    unsigned &DLLStorageClass, bool &DSOLocal) {
   Res = parseOptionalLinkageAux(Lex.getKind(), HasLinkage);
   if (HasLinkage)
     Lex.Lex();
@@ -1974,51 +2150,133 @@ void LLParser::ParseOptionalDLLStorageClass(unsigned &Res) {
 ///
 bool LLParser::ParseOptionalCallingConv(unsigned &CC) {
   switch (Lex.getKind()) {
-  default:                       CC = CallingConv::C; return false;
-  case lltok::kw_ccc:            CC = CallingConv::C; break;
-  case lltok::kw_fastcc:         CC = CallingConv::Fast; break;
-  case lltok::kw_coldcc:         CC = CallingConv::Cold; break;
-  case lltok::kw_x86_stdcallcc:  CC = CallingConv::X86_StdCall; break;
-  case lltok::kw_x86_fastcallcc: CC = CallingConv::X86_FastCall; break;
-  case lltok::kw_x86_regcallcc:  CC = CallingConv::X86_RegCall; break;
-  case lltok::kw_x86_thiscallcc: CC = CallingConv::X86_ThisCall; break;
-  case lltok::kw_x86_vectorcallcc:CC = CallingConv::X86_VectorCall; break;
-  case lltok::kw_arm_apcscc:     CC = CallingConv::ARM_APCS; break;
-  case lltok::kw_arm_aapcscc:    CC = CallingConv::ARM_AAPCS; break;
-  case lltok::kw_arm_aapcs_vfpcc:CC = CallingConv::ARM_AAPCS_VFP; break;
-  case lltok::kw_aarch64_vector_pcs:CC = CallingConv::AArch64_VectorCall; break;
-  case lltok::kw_msp430_intrcc:  CC = CallingConv::MSP430_INTR; break;
-  case lltok::kw_avr_intrcc:     CC = CallingConv::AVR_INTR; break;
-  case lltok::kw_avr_signalcc:   CC = CallingConv::AVR_SIGNAL; break;
-  case lltok::kw_ptx_kernel:     CC = CallingConv::PTX_Kernel; break;
-  case lltok::kw_ptx_device:     CC = CallingConv::PTX_Device; break;
-  case lltok::kw_spir_kernel:    CC = CallingConv::SPIR_KERNEL; break;
-  case lltok::kw_spir_func:      CC = CallingConv::SPIR_FUNC; break;
-  case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
-  case lltok::kw_x86_64_sysvcc:  CC = CallingConv::X86_64_SysV; break;
-  case lltok::kw_win64cc:        CC = CallingConv::Win64; break;
-  case lltok::kw_webkit_jscc:    CC = CallingConv::WebKit_JS; break;
-  case lltok::kw_anyregcc:       CC = CallingConv::AnyReg; break;
-  case lltok::kw_preserve_mostcc:CC = CallingConv::PreserveMost; break;
-  case lltok::kw_preserve_allcc: CC = CallingConv::PreserveAll; break;
-  case lltok::kw_ghccc:          CC = CallingConv::GHC; break;
-  case lltok::kw_swiftcc:        CC = CallingConv::Swift; break;
-  case lltok::kw_x86_intrcc:     CC = CallingConv::X86_INTR; break;
-  case lltok::kw_hhvmcc:         CC = CallingConv::HHVM; break;
-  case lltok::kw_hhvm_ccc:       CC = CallingConv::HHVM_C; break;
-  case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break;
-  case lltok::kw_amdgpu_vs:      CC = CallingConv::AMDGPU_VS; break;
-  case lltok::kw_amdgpu_ls:      CC = CallingConv::AMDGPU_LS; break;
-  case lltok::kw_amdgpu_hs:      CC = CallingConv::AMDGPU_HS; break;
-  case lltok::kw_amdgpu_es:      CC = CallingConv::AMDGPU_ES; break;
-  case lltok::kw_amdgpu_gs:      CC = CallingConv::AMDGPU_GS; break;
-  case lltok::kw_amdgpu_ps:      CC = CallingConv::AMDGPU_PS; break;
-  case lltok::kw_amdgpu_cs:      CC = CallingConv::AMDGPU_CS; break;
-  case lltok::kw_amdgpu_kernel:  CC = CallingConv::AMDGPU_KERNEL; break;
+  default:
+    CC = CallingConv::C;
+    return false;
+  case lltok::kw_ccc:
+    CC = CallingConv::C;
+    break;
+  case lltok::kw_fastcc:
+    CC = CallingConv::Fast;
+    break;
+  case lltok::kw_coldcc:
+    CC = CallingConv::Cold;
+    break;
+  case lltok::kw_x86_stdcallcc:
+    CC = CallingConv::X86_StdCall;
+    break;
+  case lltok::kw_x86_fastcallcc:
+    CC = CallingConv::X86_FastCall;
+    break;
+  case lltok::kw_x86_regcallcc:
+    CC = CallingConv::X86_RegCall;
+    break;
+  case lltok::kw_x86_thiscallcc:
+    CC = CallingConv::X86_ThisCall;
+    break;
+  case lltok::kw_x86_vectorcallcc:
+    CC = CallingConv::X86_VectorCall;
+    break;
+  case lltok::kw_arm_apcscc:
+    CC = CallingConv::ARM_APCS;
+    break;
+  case lltok::kw_arm_aapcscc:
+    CC = CallingConv::ARM_AAPCS;
+    break;
+  case lltok::kw_arm_aapcs_vfpcc:
+    CC = CallingConv::ARM_AAPCS_VFP;
+    break;
+  case lltok::kw_aarch64_vector_pcs:
+    CC = CallingConv::AArch64_VectorCall;
+    break;
+  case lltok::kw_msp430_intrcc:
+    CC = CallingConv::MSP430_INTR;
+    break;
+  case lltok::kw_avr_intrcc:
+    CC = CallingConv::AVR_INTR;
+    break;
+  case lltok::kw_avr_signalcc:
+    CC = CallingConv::AVR_SIGNAL;
+    break;
+  case lltok::kw_ptx_kernel:
+    CC = CallingConv::PTX_Kernel;
+    break;
+  case lltok::kw_ptx_device:
+    CC = CallingConv::PTX_Device;
+    break;
+  case lltok::kw_spir_kernel:
+    CC = CallingConv::SPIR_KERNEL;
+    break;
+  case lltok::kw_spir_func:
+    CC = CallingConv::SPIR_FUNC;
+    break;
+  case lltok::kw_intel_ocl_bicc:
+    CC = CallingConv::Intel_OCL_BI;
+    break;
+  case lltok::kw_x86_64_sysvcc:
+    CC = CallingConv::X86_64_SysV;
+    break;
+  case lltok::kw_win64cc:
+    CC = CallingConv::Win64;
+    break;
+  case lltok::kw_webkit_jscc:
+    CC = CallingConv::WebKit_JS;
+    break;
+  case lltok::kw_anyregcc:
+    CC = CallingConv::AnyReg;
+    break;
+  case lltok::kw_preserve_mostcc:
+    CC = CallingConv::PreserveMost;
+    break;
+  case lltok::kw_preserve_allcc:
+    CC = CallingConv::PreserveAll;
+    break;
+  case lltok::kw_ghccc:
+    CC = CallingConv::GHC;
+    break;
+  case lltok::kw_swiftcc:
+    CC = CallingConv::Swift;
+    break;
+  case lltok::kw_x86_intrcc:
+    CC = CallingConv::X86_INTR;
+    break;
+  case lltok::kw_hhvmcc:
+    CC = CallingConv::HHVM;
+    break;
+  case lltok::kw_hhvm_ccc:
+    CC = CallingConv::HHVM_C;
+    break;
+  case lltok::kw_cxx_fast_tlscc:
+    CC = CallingConv::CXX_FAST_TLS;
+    break;
+  case lltok::kw_amdgpu_vs:
+    CC = CallingConv::AMDGPU_VS;
+    break;
+  case lltok::kw_amdgpu_ls:
+    CC = CallingConv::AMDGPU_LS;
+    break;
+  case lltok::kw_amdgpu_hs:
+    CC = CallingConv::AMDGPU_HS;
+    break;
+  case lltok::kw_amdgpu_es:
+    CC = CallingConv::AMDGPU_ES;
+    break;
+  case lltok::kw_amdgpu_gs:
+    CC = CallingConv::AMDGPU_GS;
+    break;
+  case lltok::kw_amdgpu_ps:
+    CC = CallingConv::AMDGPU_PS;
+    break;
+  case lltok::kw_amdgpu_cs:
+    CC = CallingConv::AMDGPU_CS;
+    break;
+  case lltok::kw_amdgpu_kernel:
+    CC = CallingConv::AMDGPU_KERNEL;
+    break;
   case lltok::kw_cc: {
-      Lex.Lex();
-      return ParseUInt32(CC);
-    }
+    Lex.Lex();
+    return ParseUInt32(CC);
+  }
   }
 
   Lex.Lex();
@@ -2087,7 +2345,8 @@ bool LLParser::ParseOptionalAlignment(unsigned &Alignment) {
   if (!EatIfPresent(lltok::kw_align))
     return false;
   LocTy AlignLoc = Lex.getLoc();
-  if (ParseUInt32(Alignment)) return true;
+  if (ParseUInt32(Alignment))
+    return true;
   if (!isPowerOf2_32(Alignment))
     return Error(AlignLoc, "alignment is not a power of two");
   if (Alignment > Value::MaximumAlignment)
@@ -2113,7 +2372,8 @@ bool LLParser::ParseOptionalDerefAttrBytes(lltok::Kind AttrKind,
   if (!EatIfPresent(lltok::lparen))
     return Error(ParenLoc, "expected '('");
   LocTy DerefLoc = Lex.getLoc();
-  if (ParseUInt64(Bytes)) return true;
+  if (ParseUInt64(Bytes))
+    return true;
   ParenLoc = Lex.getLoc();
   if (!EatIfPresent(lltok::rparen))
     return Error(ParenLoc, "expected ')'");
@@ -2141,7 +2401,8 @@ bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment,
     if (Lex.getKind() != lltok::kw_align)
       return Error(Lex.getLoc(), "expected metadata or 'align'");
 
-    if (ParseOptionalAlignment(Alignment)) return true;
+    if (ParseOptionalAlignment(Alignment))
+      return true;
   }
 
   return false;
@@ -2153,8 +2414,7 @@ bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment,
 ///
 /// This returns with AteExtraComma set to true if it ate an excess comma at the
 /// end.
-bool LLParser::ParseOptionalCommaAddrSpace(unsigned &AddrSpace,
-                                           LocTy &Loc,
+bool LLParser::ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc,
                                            bool &AteExtraComma) {
   AteExtraComma = false;
   while (EatIfPresent(lltok::comma)) {
@@ -2249,14 +2509,25 @@ bool LLParser::ParseScope(SyncScope::ID &SSID) {
 /// This sets Ordering to the parsed value.
 bool LLParser::ParseOrdering(AtomicOrdering &Ordering) {
   switch (Lex.getKind()) {
-  default: return TokError("Expected ordering on atomic instruction");
-  case lltok::kw_unordered: Ordering = AtomicOrdering::Unordered; break;
-  case lltok::kw_monotonic: Ordering = AtomicOrdering::Monotonic; break;
+  default:
+    return TokError("Expected ordering on atomic instruction");
+  case lltok::kw_unordered:
+    Ordering = AtomicOrdering::Unordered;
+    break;
+  case lltok::kw_monotonic:
+    Ordering = AtomicOrdering::Monotonic;
+    break;
   // Not specified yet:
   // case lltok::kw_consume: Ordering = AtomicOrdering::Consume; break;
-  case lltok::kw_acquire: Ordering = AtomicOrdering::Acquire; break;
-  case lltok::kw_release: Ordering = AtomicOrdering::Release; break;
-  case lltok::kw_acq_rel: Ordering = AtomicOrdering::AcquireRelease; break;
+  case lltok::kw_acquire:
+    Ordering = AtomicOrdering::Acquire;
+    break;
+  case lltok::kw_release:
+    Ordering = AtomicOrdering::Release;
+    break;
+  case lltok::kw_acq_rel:
+    Ordering = AtomicOrdering::AcquireRelease;
+    break;
   case lltok::kw_seq_cst:
     Ordering = AtomicOrdering::SequentiallyConsistent;
     break;
@@ -2276,7 +2547,8 @@ bool LLParser::ParseOptionalStackAlignment(unsigned &Alignment) {
   if (!EatIfPresent(lltok::lparen))
     return Error(ParenLoc, "expected '('");
   LocTy AlignLoc = Lex.getLoc();
-  if (ParseUInt32(Alignment)) return true;
+  if (ParseUInt32(Alignment))
+    return true;
   ParenLoc = Lex.getLoc();
   if (!EatIfPresent(lltok::rparen))
     return Error(ParenLoc, "expected ')'");
@@ -2303,12 +2575,14 @@ bool LLParser::ParseIndexList(SmallVectorImpl<unsigned> &Indices,
 
   while (EatIfPresent(lltok::comma)) {
     if (Lex.getKind() == lltok::MetadataVar) {
-      if (Indices.empty()) return TokError("expected index");
+      if (Indices.empty())
+        return TokError("expected index");
       AteExtraComma = true;
       return false;
     }
     unsigned Idx = 0;
-    if (ParseUInt32(Idx)) return true;
+    if (ParseUInt32(Idx))
+      return true;
     Indices.push_back(Idx);
   }
 
@@ -2353,7 +2627,7 @@ bool LLParser::ParseType(Type *&Result, const Twine &Msg, bool AllowVoid) {
     break;
   case lltok::LocalVar: {
     // Type ::= %foo
-    std::pair<Type*, LocTy> &Entry = NamedTypes[Lex.getStrVal()];
+    std::pair<Type *, LocTy> &Entry = NamedTypes[Lex.getStrVal()];
 
     // If the type hasn't been defined yet, create a forward definition and
     // remember where that forward def'n was seen (in case it never is defined).
@@ -2368,7 +2642,7 @@ bool LLParser::ParseType(Type *&Result, const Twine &Msg, bool AllowVoid) {
 
   case lltok::LocalVarID: {
     // Type ::= %4
-    std::pair<Type*, LocTy> &Entry = NumberedTypes[Lex.getUIntVal()];
+    std::pair<Type *, LocTy> &Entry = NumberedTypes[Lex.getUIntVal()];
 
     // If the type hasn't been defined yet, create a forward definition and
     // remember where that forward def'n was seen (in case it never is defined).
@@ -2453,7 +2727,7 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
         return TokError(Twine(Msg) + "non-musttail call");
       if (!InVarArgsFunc)
         return TokError(Twine(Msg) + "musttail call in non-varargs function");
-      Lex.Lex();  // Lex the '...', it is purely for readability.
+      Lex.Lex(); // Lex the '...', it is purely for readability.
       return ParseToken(lltok::rparen, "expected ')' at end of argument list");
     }
 
@@ -2473,15 +2747,15 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
       if (ParseOptionalParamAttrs(ArgAttrs) || ParseValue(ArgTy, V, PFS))
         return true;
     }
-    ArgList.push_back(ParamInfo(
-        ArgLoc, V, AttributeSet::get(V->getContext(), ArgAttrs)));
+    ArgList.push_back(
+        ParamInfo(ArgLoc, V, AttributeSet::get(V->getContext(), ArgAttrs)));
   }
 
   if (IsMustTailCall && InVarArgsFunc)
     return TokError("expected '...' at end of argument list for musttail call "
                     "in varargs function");
 
-  Lex.Lex();  // Lex the ')'.
+  Lex.Lex(); // Lex the ')'.
   return false;
 }
 
@@ -2565,7 +2839,7 @@ bool LLParser::ParseOptionalOperandBundles(
 ///   ::= ArgType (',' ArgType)*
 ///
 bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
-                                 bool &isVarArg){
+                                 bool &isVarArg) {
   isVarArg = false;
   assert(Lex.getKind() == lltok::lparen);
   Lex.Lex(); // eat the (.
@@ -2581,8 +2855,8 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
     AttrBuilder Attrs;
     std::string Name;
 
-    if (ParseType(ArgTy) ||
-        ParseOptionalParamAttrs(Attrs)) return true;
+    if (ParseType(ArgTy) || ParseOptionalParamAttrs(Attrs))
+      return true;
 
     if (ArgTy->isVoidTy())
       return Error(TypeLoc, "argument can not have void type");
@@ -2608,7 +2882,8 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
 
       // Otherwise must be an argument type.
       TypeLoc = Lex.getLoc();
-      if (ParseType(ArgTy) || ParseOptionalParamAttrs(Attrs)) return true;
+      if (ParseType(ArgTy) || ParseOptionalParamAttrs(Attrs))
+        return true;
 
       if (ArgTy->isVoidTy())
         return Error(TypeLoc, "argument can not have void type");
@@ -2654,7 +2929,7 @@ bool LLParser::ParseFunctionType(Type *&Result) {
                    "argument attributes invalid in function type");
   }
 
-  SmallVector<Type*, 16> ArgListTy;
+  SmallVector<Type *, 16> ArgListTy;
   for (unsigned i = 0, e = ArgList.size(); i != e; ++i)
     ArgListTy.push_back(ArgList[i].Ty);
 
@@ -2665,8 +2940,9 @@ bool LLParser::ParseFunctionType(Type *&Result) {
 /// ParseAnonStructType - Parse an anonymous struct type, which is inlined into
 /// other structs.
 bool LLParser::ParseAnonStructType(Type *&Result, bool Packed) {
-  SmallVector<Type*, 8> Elts;
-  if (ParseStructBody(Elts)) return true;
+  SmallVector<Type *, 8> Elts;
+  if (ParseStructBody(Elts))
+    return true;
 
   Result = StructType::get(Context, Elts, Packed);
   return false;
@@ -2674,7 +2950,7 @@ bool LLParser::ParseAnonStructType(Type *&Result, bool Packed) {
 
 /// ParseStructDefinition - Parse a struct in a 'type' definition.
 bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name,
-                                     std::pair<Type*, LocTy> &Entry,
+                                     std::pair<Type *, LocTy> &Entry,
                                      Type *&ResultTy) {
   // If the type was already defined, diagnose the redefinition.
   if (Entry.first && !Entry.second.isValid())
@@ -2718,7 +2994,7 @@ bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name,
 
   StructType *STy = cast<StructType>(Entry.first);
 
-  SmallVector<Type*, 8> Body;
+  SmallVector<Type *, 8> Body;
   if (ParseStructBody(Body) ||
       (isPacked && ParseToken(lltok::greater, "expected '>' in packed struct")))
     return true;
@@ -2734,7 +3010,7 @@ bool LLParser::ParseStructDefinition(SMLoc TypeLoc, StringRef Name,
 ///     ::= '{' Type (',' Type)* '}'
 ///     ::= '<' '{' '}' '>'
 ///     ::= '<' '{' Type (',' Type)* '}' '>'
-bool LLParser::ParseStructBody(SmallVectorImpl<Type*> &Body) {
+bool LLParser::ParseStructBody(SmallVectorImpl<Type *> &Body) {
   assert(Lex.getKind() == lltok::lbrace);
   Lex.Lex(); // Consume the '{'
 
@@ -2744,7 +3020,8 @@ bool LLParser::ParseStructBody(SmallVectorImpl<Type*> &Body) {
 
   LocTy EltTyLoc = Lex.getLoc();
   Type *Ty = nullptr;
-  if (ParseType(Ty)) return true;
+  if (ParseType(Ty))
+    return true;
   Body.push_back(Ty);
 
   if (!StructType::isValidElementType(Ty))
@@ -2752,7 +3029,8 @@ bool LLParser::ParseStructBody(SmallVectorImpl<Type*> &Body) {
 
   while (EatIfPresent(lltok::comma)) {
     EltTyLoc = Lex.getLoc();
-    if (ParseType(Ty)) return true;
+    if (ParseType(Ty))
+      return true;
 
     if (!StructType::isValidElementType(Ty))
       return Error(EltTyLoc, "invalid element type for struct");
@@ -2789,11 +3067,12 @@ bool LLParser::ParseArrayVectorType(Type *&Result, bool isVector) {
   Lex.Lex();
 
   if (ParseToken(lltok::kw_x, "expected 'x' after element count"))
-      return true;
+    return true;
 
   LocTy TypeLoc = Lex.getLoc();
   Type *EltTy = nullptr;
-  if (ParseType(EltTy)) return true;
+  if (ParseType(EltTy))
+    return true;
 
   if (ParseToken(isVector ? lltok::greater : lltok::rsquare,
                  "expected end of sequential type"))
@@ -2821,7 +3100,7 @@ bool LLParser::ParseArrayVectorType(Type *&Result, bool isVector) {
 
 LLParser::PerFunctionState::PerFunctionState(LLParser &p, Function &f,
                                              int functionNumber)
-  : P(p), F(f), FunctionNumber(functionNumber) {
+    : P(p), F(f), FunctionNumber(functionNumber) {
 
   // Insert unnamed arguments into the NumberedVals list.
   for (Argument &A : F.args())
@@ -2853,11 +3132,11 @@ bool LLParser::PerFunctionState::FinishFunction() {
   if (!ForwardRefVals.empty())
     return P.Error(ForwardRefVals.begin()->second.second,
                    "use of undefined value '%" + ForwardRefVals.begin()->first +
-                   "'");
+                       "'");
   if (!ForwardRefValIDs.empty())
     return P.Error(ForwardRefValIDs.begin()->second.second,
                    "use of undefined value '%" +
-                   Twine(ForwardRefValIDs.begin()->first) + "'");
+                       Twine(ForwardRefValIDs.begin()->first) + "'");
   return false;
 }
 
@@ -2954,14 +3233,15 @@ bool LLParser::PerFunctionState::SetInstName(int NameID,
 
     if (unsigned(NameID) != NumberedVals.size())
       return P.Error(NameLoc, "instruction expected to be numbered '%" +
-                     Twine(NumberedVals.size()) + "'");
+                                  Twine(NumberedVals.size()) + "'");
 
     auto FI = ForwardRefValIDs.find(NameID);
     if (FI != ForwardRefValIDs.end()) {
       Value *Sentinel = FI->second.first;
       if (Sentinel->getType() != Inst->getType())
         return P.Error(NameLoc, "instruction forward referenced with type '" +
-                       getTypeString(FI->second.first->getType()) + "'");
+                                    getTypeString(FI->second.first->getType()) +
+                                    "'");
 
       Sentinel->replaceAllUsesWith(Inst);
       Sentinel->deleteValue();
@@ -2978,7 +3258,8 @@ bool LLParser::PerFunctionState::SetInstName(int NameID,
     Value *Sentinel = FI->second.first;
     if (Sentinel->getType() != Inst->getType())
       return P.Error(NameLoc, "instruction forward referenced with type '" +
-                     getTypeString(FI->second.first->getType()) + "'");
+                                  getTypeString(FI->second.first->getType()) +
+                                  "'");
 
     Sentinel->replaceAllUsesWith(Inst);
     Sentinel->deleteValue();
@@ -2990,7 +3271,7 @@ bool LLParser::PerFunctionState::SetInstName(int NameID,
 
   if (Inst->getName() != NameStr)
     return P.Error(NameLoc, "multiple definition of local value named '" +
-                   NameStr + "'");
+                                NameStr + "'");
   return false;
 }
 
@@ -3062,20 +3343,21 @@ BasicBlock *LLParser::PerFunctionState::DefineBB(const std::string &Name,
 bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
   ID.Loc = Lex.getLoc();
   switch (Lex.getKind()) {
-  default: return TokError("expected value token");
-  case lltok::GlobalID:  // @42
+  default:
+    return TokError("expected value token");
+  case lltok::GlobalID: // @42
     ID.UIntVal = Lex.getUIntVal();
     ID.Kind = ValID::t_GlobalID;
     break;
-  case lltok::GlobalVar:  // @foo
+  case lltok::GlobalVar: // @foo
     ID.StrVal = Lex.getStrVal();
     ID.Kind = ValID::t_GlobalName;
     break;
-  case lltok::LocalVarID:  // %42
+  case lltok::LocalVarID: // %42
     ID.UIntVal = Lex.getUIntVal();
     ID.Kind = ValID::t_LocalID;
     break;
-  case lltok::LocalVar:  // %foo
+  case lltok::LocalVar: // %foo
     ID.StrVal = Lex.getStrVal();
     ID.Kind = ValID::t_LocalName;
     break;
@@ -3095,15 +3377,23 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     ID.ConstantVal = ConstantInt::getFalse(Context);
     ID.Kind = ValID::t_Constant;
     break;
-  case lltok::kw_null: ID.Kind = ValID::t_Null; break;
-  case lltok::kw_undef: ID.Kind = ValID::t_Undef; break;
-  case lltok::kw_zeroinitializer: ID.Kind = ValID::t_Zero; break;
-  case lltok::kw_none: ID.Kind = ValID::t_None; break;
+  case lltok::kw_null:
+    ID.Kind = ValID::t_Null;
+    break;
+  case lltok::kw_undef:
+    ID.Kind = ValID::t_Undef;
+    break;
+  case lltok::kw_zeroinitializer:
+    ID.Kind = ValID::t_Zero;
+    break;
+  case lltok::kw_none:
+    ID.Kind = ValID::t_None;
+    break;
 
   case lltok::lbrace: {
     // ValID ::= '{' ConstVector '}'
     Lex.Lex();
-    SmallVector<Constant*, 16> Elts;
+    SmallVector<Constant *, 16> Elts;
     if (ParseGlobalValueVector(Elts) ||
         ParseToken(lltok::rbrace, "expected end of struct constant"))
       return true;
@@ -3121,7 +3411,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     Lex.Lex();
     bool isPackedStruct = EatIfPresent(lltok::lbrace);
 
-    SmallVector<Constant*, 16> Elts;
+    SmallVector<Constant *, 16> Elts;
     LocTy FirstEltLoc = Lex.getLoc();
     if (ParseGlobalValueVector(Elts) ||
         (isPackedStruct &&
@@ -3144,23 +3434,24 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     if (!Elts[0]->getType()->isIntegerTy() &&
         !Elts[0]->getType()->isFloatingPointTy() &&
         !Elts[0]->getType()->isPointerTy())
-      return Error(FirstEltLoc,
-            "vector elements must have integer, pointer or floating point type");
+      return Error(
+          FirstEltLoc,
+          "vector elements must have integer, pointer or floating point type");
 
     // Verify that all the vector elements have the same type.
     for (unsigned i = 1, e = Elts.size(); i != e; ++i)
       if (Elts[i]->getType() != Elts[0]->getType())
-        return Error(FirstEltLoc,
-                     "vector element #" + Twine(i) +
-                    " is not of type '" + getTypeString(Elts[0]->getType()));
+        return Error(FirstEltLoc, "vector element #" + Twine(i) +
+                                      " is not of type '" +
+                                      getTypeString(Elts[0]->getType()));
 
     ID.ConstantVal = ConstantVector::get(Elts);
     ID.Kind = ValID::t_Constant;
     return false;
   }
-  case lltok::lsquare: {   // Array Constant
+  case lltok::lsquare: { // Array Constant
     Lex.Lex();
-    SmallVector<Constant*, 16> Elts;
+    SmallVector<Constant *, 16> Elts;
     LocTy FirstEltLoc = Lex.getLoc();
     if (ParseGlobalValueVector(Elts) ||
         ParseToken(lltok::rsquare, "expected end of array constant"))
@@ -3176,27 +3467,28 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
 
     if (!Elts[0]->getType()->isFirstClassType())
       return Error(FirstEltLoc, "invalid array element type: " +
-                   getTypeString(Elts[0]->getType()));
+                                    getTypeString(Elts[0]->getType()));
 
     ArrayType *ATy = ArrayType::get(Elts[0]->getType(), Elts.size());
 
     // Verify all elements are correct type!
     for (unsigned i = 0, e = Elts.size(); i != e; ++i) {
       if (Elts[i]->getType() != Elts[0]->getType())
-        return Error(FirstEltLoc,
-                     "array element #" + Twine(i) +
-                     " is not of type '" + getTypeString(Elts[0]->getType()));
+        return Error(FirstEltLoc, "array element #" + Twine(i) +
+                                      " is not of type '" +
+                                      getTypeString(Elts[0]->getType()));
     }
 
     ID.ConstantVal = ConstantArray::get(ATy, Elts);
     ID.Kind = ValID::t_Constant;
     return false;
   }
-  case lltok::kw_c:  // c "foo"
+  case lltok::kw_c: // c "foo"
     Lex.Lex();
-    ID.ConstantVal = ConstantDataArray::getString(Context, Lex.getStrVal(),
-                                                  false);
-    if (ParseToken(lltok::StringConstant, "expected string")) return true;
+    ID.ConstantVal =
+        ConstantDataArray::getString(Context, Lex.getStrVal(), false);
+    if (ParseToken(lltok::StringConstant, "expected string"))
+      return true;
     ID.Kind = ValID::t_Constant;
     return false;
 
@@ -3213,8 +3505,8 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
         ParseToken(lltok::StringConstant, "expected constraint string"))
       return true;
     ID.StrVal2 = Lex.getStrVal();
-    ID.UIntVal = unsigned(HasSideEffect) | (unsigned(AlignStack)<<1) |
-      (unsigned(AsmDialect)<<2);
+    ID.UIntVal = unsigned(HasSideEffect) | (unsigned(AlignStack) << 1) |
+                 (unsigned(AsmDialect) << 2);
     ID.Kind = ValID::t_InlineAsm;
     return false;
   }
@@ -3227,7 +3519,8 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
 
     if (ParseToken(lltok::lparen, "expected '(' in block address expression") ||
         ParseValID(Fn) ||
-        ParseToken(lltok::comma, "expected comma in block address expression")||
+        ParseToken(lltok::comma,
+                   "expected comma in block address expression") ||
         ParseValID(Label) ||
         ParseToken(lltok::rparen, "expected ')' in block address expression"))
       return true;
@@ -3258,9 +3551,9 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     if (!F) {
       // Make a global variable as a placeholder for this reference.
       GlobalValue *&FwdRef =
-          ForwardRefBlockAddresses.insert(std::make_pair(
-                                              std::move(Fn),
-                                              std::map<ValID, GlobalValue *>()))
+          ForwardRefBlockAddresses
+              .insert(std::make_pair(std::move(Fn),
+                                     std::map<ValID, GlobalValue *>()))
               .first->second.insert(std::make_pair(std::move(Label), nullptr))
               .first->second;
       if (!FwdRef)
@@ -3321,10 +3614,10 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
       return true;
     if (!CastInst::castIsValid((Instruction::CastOps)Opc, SrcVal, DestTy))
       return Error(ID.Loc, "invalid cast opcode for cast from '" +
-                   getTypeString(SrcVal->getType()) + "' to '" +
-                   getTypeString(DestTy) + "'");
-    ID.ConstantVal = ConstantExpr::getCast((Instruction::CastOps)Opc,
-                                                 SrcVal, DestTy);
+                               getTypeString(SrcVal->getType()) + "' to '" +
+                               getTypeString(DestTy) + "'");
+    ID.ConstantVal =
+        ConstantExpr::getCast((Instruction::CastOps)Opc, SrcVal, DestTy);
     ID.Kind = ValID::t_Constant;
     return false;
   }
@@ -3332,9 +3625,9 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     Lex.Lex();
     Constant *Val;
     SmallVector<unsigned, 4> Indices;
-    if (ParseToken(lltok::lparen, "expected '(' in extractvalue constantexpr")||
-        ParseGlobalTypeAndValue(Val) ||
-        ParseIndexList(Indices) ||
+    if (ParseToken(lltok::lparen,
+                   "expected '(' in extractvalue constantexpr") ||
+        ParseGlobalTypeAndValue(Val) || ParseIndexList(Indices) ||
         ParseToken(lltok::rparen, "expected ')' in extractvalue constantexpr"))
       return true;
 
@@ -3350,11 +3643,11 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     Lex.Lex();
     Constant *Val0, *Val1;
     SmallVector<unsigned, 4> Indices;
-    if (ParseToken(lltok::lparen, "expected '(' in insertvalue constantexpr")||
+    if (ParseToken(lltok::lparen, "expected '(' in insertvalue constantexpr") ||
         ParseGlobalTypeAndValue(Val0) ||
-        ParseToken(lltok::comma, "expected comma in insertvalue constantexpr")||
-        ParseGlobalTypeAndValue(Val1) ||
-        ParseIndexList(Indices) ||
+        ParseToken(lltok::comma,
+                   "expected comma in insertvalue constantexpr") ||
+        ParseGlobalTypeAndValue(Val1) || ParseIndexList(Indices) ||
         ParseToken(lltok::rparen, "expected ')' in insertvalue constantexpr"))
       return true;
     if (!Val0->getType()->isAggregateType())
@@ -3404,7 +3697,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     ID.Kind = ValID::t_Constant;
     return false;
   }
- 
+
   // Unary Operators.
   case lltok::kw_fneg: {
     unsigned Opc = Lex.getUIntVal();
@@ -3414,14 +3707,15 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
         ParseGlobalTypeAndValue(Val) ||
         ParseToken(lltok::rparen, "expected ')' in unary constantexpr"))
       return true;
-    
+
     // Check that the type is valid for the operator.
     switch (Opc) {
     case Instruction::FNeg:
       if (!Val->getType()->isFPOrFPVectorTy())
         return Error(ID.Loc, "constexpr requires fp operands");
       break;
-    default: llvm_unreachable("Unknown unary operator!");
+    default:
+      llvm_unreachable("Unknown unary operator!");
     }
     unsigned Flags = 0;
     Constant *C = ConstantExpr::get(Opc, Val, Flags);
@@ -3496,12 +3790,16 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
       if (!Val0->getType()->isFPOrFPVectorTy())
         return Error(ID.Loc, "constexpr requires fp operands");
       break;
-    default: llvm_unreachable("Unknown binary operator!");
+    default:
+      llvm_unreachable("Unknown binary operator!");
     }
     unsigned Flags = 0;
-    if (NUW)   Flags |= OverflowingBinaryOperator::NoUnsignedWrap;
-    if (NSW)   Flags |= OverflowingBinaryOperator::NoSignedWrap;
-    if (Exact) Flags |= PossiblyExactOperator::IsExact;
+    if (NUW)
+      Flags |= OverflowingBinaryOperator::NoUnsignedWrap;
+    if (NSW)
+      Flags |= OverflowingBinaryOperator::NoSignedWrap;
+    if (Exact)
+      Flags |= PossiblyExactOperator::IsExact;
     Constant *C = ConstantExpr::get(Opc, Val0, Val1, Flags);
     ID.ConstantVal = C;
     ID.Kind = ValID::t_Constant;
@@ -3537,7 +3835,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
   case lltok::kw_extractelement:
   case lltok::kw_select: {
     unsigned Opc = Lex.getUIntVal();
-    SmallVector<Constant*, 16> Elts;
+    SmallVector<Constant *, 16> Elts;
     bool InBounds = false;
     Type *Ty;
     Lex.Lex();
@@ -3562,8 +3860,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
       return true;
 
     if (Opc == Instruction::GetElementPtr) {
-      if (Elts.size() == 0 ||
-          !Elts[0]->getType()->isPtrOrPtrVectorTy())
+      if (Elts.size() == 0 || !Elts[0]->getType()->isPtrOrPtrVectorTy())
         return Error(ID.Loc, "base of getelementptr must be a pointer");
 
       Type *BaseType = Elts[0]->getType();
@@ -3593,7 +3890,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
         }
       }
 
-      SmallPtrSet<Type*, 4> Visited;
+      SmallPtrSet<Type *, 4> Visited;
       if (!Indices.empty() && !Ty->isSized(&Visited))
         return Error(ID.Loc, "base element of getelementptr must be sized");
 
@@ -3612,8 +3909,8 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     } else if (Opc == Instruction::Select) {
       if (Elts.size() != 3)
         return Error(ID.Loc, "expected three operands to select");
-      if (const char *Reason = SelectInst::areInvalidOperands(Elts[0], Elts[1],
-                                                              Elts[2]))
+      if (const char *Reason =
+              SelectInst::areInvalidOperands(Elts[0], Elts[1], Elts[2]))
         return Error(ID.Loc, Reason);
       ID.ConstantVal = ConstantExpr::getSelect(Elts[0], Elts[1], Elts[2]);
     } else if (Opc == Instruction::ShuffleVector) {
@@ -3622,7 +3919,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
       if (!ShuffleVectorInst::isValidOperands(Elts[0], Elts[1], Elts[2]))
         return Error(ID.Loc, "invalid operands to shufflevector");
       ID.ConstantVal =
-                 ConstantExpr::getShuffleVector(Elts[0], Elts[1],Elts[2]);
+          ConstantExpr::getShuffleVector(Elts[0], Elts[1], Elts[2]);
     } else if (Opc == Instruction::ExtractElement) {
       if (Elts.size() != 2)
         return Error(ID.Loc, "expected two operands to extractelement");
@@ -3632,11 +3929,11 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     } else {
       assert(Opc == Instruction::InsertElement && "Unknown opcode");
       if (Elts.size() != 3)
-      return Error(ID.Loc, "expected three operands to insertelement");
+        return Error(ID.Loc, "expected three operands to insertelement");
       if (!InsertElementInst::isValidOperands(Elts[0], Elts[1], Elts[2]))
         return Error(ID.Loc, "invalid insertelement operands");
       ID.ConstantVal =
-                 ConstantExpr::getInsertElement(Elts[0], Elts[1],Elts[2]);
+          ConstantExpr::getInsertElement(Elts[0], Elts[1], Elts[2]);
     }
 
     ID.Kind = ValID::t_Constant;
@@ -3662,8 +3959,7 @@ bool LLParser::ParseGlobalValue(Type *Ty, Constant *&C) {
 
 bool LLParser::ParseGlobalTypeAndValue(Constant *&V) {
   Type *Ty = nullptr;
-  return ParseType(Ty) ||
-         ParseGlobalValue(Ty, V);
+  return ParseType(Ty) || ParseGlobalValue(Ty, V);
 }
 
 bool LLParser::parseOptionalComdat(StringRef GlobalName, Comdat *&C) {
@@ -3695,10 +3991,8 @@ bool LLParser::parseOptionalComdat(StringRef GlobalName, Comdat *&C) {
 bool LLParser::ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts,
                                       Optional<unsigned> *InRangeOp) {
   // Empty list.
-  if (Lex.getKind() == lltok::rbrace ||
-      Lex.getKind() == lltok::rsquare ||
-      Lex.getKind() == lltok::greater ||
-      Lex.getKind() == lltok::rparen)
+  if (Lex.getKind() == lltok::rbrace || Lex.getKind() == lltok::rsquare ||
+      Lex.getKind() == lltok::greater || Lex.getKind() == lltok::rparen)
     return false;
 
   do {
@@ -3706,7 +4000,8 @@ bool LLParser::ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts,
       *InRangeOp = Elts.size();
 
     Constant *C;
-    if (ParseGlobalTypeAndValue(C)) return true;
+    if (ParseGlobalTypeAndValue(C))
+      return true;
     Elts.push_back(C);
   } while (EatIfPresent(lltok::comma));
 
@@ -3730,8 +4025,7 @@ bool LLParser::ParseMDNode(MDNode *&N) {
   if (Lex.getKind() == lltok::MetadataVar)
     return ParseSpecializedMDNode(N);
 
-  return ParseToken(lltok::exclaim, "expected '!' here") ||
-         ParseMDNodeTail(N);
+  return ParseToken(lltok::exclaim, "expected '!' here") || ParseMDNodeTail(N);
 }
 
 bool LLParser::ParseMDNodeTail(MDNode *&N) {
@@ -3770,11 +4064,7 @@ template <class FieldTypeA, class FieldTypeB> struct MDEitherFieldImpl {
   FieldTypeB B;
   bool Seen;
 
-  enum {
-    IsInvalid = 0,
-    IsTypeA = 1,
-    IsTypeB = 2
-  } WhatIs;
+  enum { IsInvalid = 0, IsTypeA = 1, IsTypeB = 2 } WhatIs;
 
   void assign(FieldTypeA A) {
     Seen = true;
@@ -3817,7 +4107,7 @@ struct DwarfTagField : public MDUnsignedField {
 struct DwarfMacinfoTypeField : public MDUnsignedField {
   DwarfMacinfoTypeField() : MDUnsignedField(0, dwarf::DW_MACINFO_vendor_ext) {}
   DwarfMacinfoTypeField(dwarf::MacinfoRecordType DefaultType)
-    : MDUnsignedField(DefaultType, dwarf::DW_MACINFO_vendor_ext) {}
+      : MDUnsignedField(DefaultType, dwarf::DW_MACINFO_vendor_ext) {}
 };
 
 struct DwarfAttEncodingField : public MDUnsignedField {
@@ -3987,8 +4277,8 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
 
   unsigned Macinfo = dwarf::getMacinfo(Lex.getStrVal());
   if (Macinfo == dwarf::DW_MACINFO_invalid)
-    return TokError(
-        "invalid DWARF macinfo type" + Twine(" '") + Lex.getStrVal() + "'");
+    return TokError("invalid DWARF macinfo type" + Twine(" '") +
+                    Lex.getStrVal() + "'");
   assert(Macinfo <= Result.Max && "Expected valid DWARF macinfo type");
 
   Result.assign(Macinfo);
@@ -4043,8 +4333,8 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfCCField &Result) {
 
   unsigned CC = dwarf::getCallingConvention(Lex.getStrVal());
   if (!CC)
-    return TokError("invalid DWARF calling convention" + Twine(" '") + Lex.getStrVal() +
-                    "'");
+    return TokError("invalid DWARF calling convention" + Twine(" '") +
+                    Lex.getStrVal() + "'");
   assert(CC <= Result.Max && "Expected valid DWARF calling convention");
   Result.assign(CC);
   Lex.Lex();
@@ -4052,7 +4342,8 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DwarfCCField &Result) {
 }
 
 template <>
-bool LLParser::ParseMDField(LocTy Loc, StringRef Name, EmissionKindField &Result) {
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
+                            EmissionKindField &Result) {
   if (Lex.getKind() == lltok::APSInt)
     return ParseMDField(Loc, Name, static_cast<MDUnsignedField &>(Result));
 
@@ -4188,8 +4479,7 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name, DISPFlagField &Result) {
 }
 
 template <>
-bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
-                            MDSignedField &Result) {
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDSignedField &Result) {
   if (Lex.getKind() != lltok::APSInt)
     return TokError("expected signed integer");
 
@@ -4316,8 +4606,8 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
       DIFile::getChecksumKind(Lex.getStrVal());
 
   if (Lex.getKind() != lltok::ChecksumKind || !CSKind)
-    return TokError(
-        "invalid checksum kind" + Twine(" '") + Lex.getStrVal() + "'");
+    return TokError("invalid checksum kind" + Twine(" '") + Lex.getStrVal() +
+                    "'");
 
   Result.assign(*CSKind);
   Lex.Lex();
@@ -4387,10 +4677,13 @@ bool LLParser::ParseSpecializedMDNode(MDNode *&N, bool IsDistinct) {
   VISIT_MD_FIELDS(DECLARE_FIELD, DECLARE_FIELD)                                \
   do {                                                                         \
     LocTy ClosingLoc;                                                          \
-    if (ParseMDFieldsImpl([&]() -> bool {                                      \
-      VISIT_MD_FIELDS(PARSE_MD_FIELD, PARSE_MD_FIELD)                          \
-      return TokError(Twine("invalid field '") + Lex.getStrVal() + "'");       \
-    }, ClosingLoc))                                                            \
+    if (ParseMDFieldsImpl(                                                     \
+            [&]() -> bool {                                                    \
+              VISIT_MD_FIELDS(PARSE_MD_FIELD, PARSE_MD_FIELD)                  \
+              return TokError(Twine("invalid field '") + Lex.getStrVal() +     \
+                              "'");                                            \
+            },                                                                 \
+            ClosingLoc))                                                       \
       return true;                                                             \
     VISIT_MD_FIELDS(NOP_FIELD, REQUIRE_FIELD)                                  \
   } while (false)
@@ -4520,11 +4813,10 @@ bool LLParser::ParseDIDerivedType(MDNode *&Result, bool IsDistinct) {
   if (dwarfAddressSpace.Val != UINT32_MAX)
     DWARFAddressSpace = dwarfAddressSpace.Val;
 
-  Result = GET_OR_DISTINCT(DIDerivedType,
-                           (Context, tag.Val, name.Val, file.Val, line.Val,
-                            scope.Val, baseType.Val, size.Val, align.Val,
-                            offset.Val, DWARFAddressSpace, flags.Val,
-                            extraData.Val));
+  Result = GET_OR_DISTINCT(
+      DIDerivedType, (Context, tag.Val, name.Val, file.Val, line.Val, scope.Val,
+                      baseType.Val, size.Val, align.Val, offset.Val,
+                      DWARFAddressSpace, flags.Val, extraData.Val));
   return false;
 }
 
@@ -4554,8 +4846,8 @@ bool LLParser::ParseDICompositeType(MDNode *&Result, bool IsDistinct) {
     if (auto *CT = DICompositeType::buildODRType(
             Context, *identifier.Val, tag.Val, name.Val, file.Val, line.Val,
             scope.Val, baseType.Val, size.Val, align.Val, offset.Val, flags.Val,
-            elements.Val, runtimeLang.Val, vtableHolder.Val,
-            templateParams.Val, discriminator.Val)) {
+            elements.Val, runtimeLang.Val, vtableHolder.Val, templateParams.Val,
+            discriminator.Val)) {
       Result = CT;
       return false;
     }
@@ -4611,8 +4903,8 @@ bool LLParser::ParseDIFile(MDNode *&Result, bool IsDistinct) {
   Optional<MDString *> OptSource;
   if (source.Seen)
     OptSource = source.Val;
-  Result = GET_OR_DISTINCT(DIFile, (Context, filename.Val, directory.Val,
-                                    OptChecksum, OptSource));
+  Result = GET_OR_DISTINCT(
+      DIFile, (Context, filename.Val, directory.Val, OptChecksum, OptSource));
   return false;
 }
 
@@ -4750,13 +5042,12 @@ bool LLParser::ParseDICommonBlock(MDNode *&Result, bool IsDistinct) {
   OPTIONAL(declaration, MDField, );                                            \
   OPTIONAL(name, MDStringField, );                                             \
   OPTIONAL(file, MDField, );                                                   \
-  OPTIONAL(line, LineField, );						       
+  OPTIONAL(line, LineField, );
   PARSE_MD_FIELDS();
 #undef VISIT_MD_FIELDS
 
-  Result = GET_OR_DISTINCT(DICommonBlock,
-                           (Context, scope.Val, declaration.Val, name.Val,
-                            file.Val, line.Val));
+  Result = GET_OR_DISTINCT(DICommonBlock, (Context, scope.Val, declaration.Val,
+                                           name.Val, file.Val, line.Val));
   return false;
 }
 
@@ -4776,7 +5067,8 @@ bool LLParser::ParseDINamespace(MDNode *&Result, bool IsDistinct) {
 }
 
 /// ParseDIMacro:
-///   ::= !DIMacro(macinfo: type, line: 9, name: "SomeMacro", value: "SomeValue")
+///   ::= !DIMacro(macinfo: type, line: 9, name: "SomeMacro",
+///                value: "SomeValue")
 bool LLParser::ParseDIMacro(MDNode *&Result, bool IsDistinct) {
 #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
   REQUIRED(type, DwarfMacinfoTypeField, );                                     \
@@ -4820,8 +5112,9 @@ bool LLParser::ParseDIModule(MDNode *&Result, bool IsDistinct) {
   PARSE_MD_FIELDS();
 #undef VISIT_MD_FIELDS
 
-  Result = GET_OR_DISTINCT(DIModule, (Context, scope.Val, name.Val,
-                           configMacros.Val, includePath.Val, isysroot.Val));
+  Result =
+      GET_OR_DISTINCT(DIModule, (Context, scope.Val, name.Val, configMacros.Val,
+                                 includePath.Val, isysroot.Val));
   return false;
 }
 
@@ -4954,7 +5247,8 @@ bool LLParser::ParseDIExpression(MDNode *&Result, bool IsDistinct) {
           Elements.push_back(Op);
           continue;
         }
-        return TokError(Twine("invalid DWARF attribute encoding '") + Lex.getStrVal() + "'");
+        return TokError(Twine("invalid DWARF attribute encoding '") +
+                        Lex.getStrVal() + "'");
       }
 
       if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned())
@@ -5130,11 +5424,13 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
 
   switch (ID.Kind) {
   case ValID::t_LocalID:
-    if (!PFS) return Error(ID.Loc, "invalid use of function-local name");
+    if (!PFS)
+      return Error(ID.Loc, "invalid use of function-local name");
     V = PFS->GetVal(ID.UIntVal, Ty, ID.Loc, IsCall);
     return V == nullptr;
   case ValID::t_LocalName:
-    if (!PFS) return Error(ID.Loc, "invalid use of function-local name");
+    if (!PFS)
+      return Error(ID.Loc, "invalid use of function-local name");
     V = PFS->GetVal(ID.StrVal, Ty, ID.Loc, IsCall);
     return V == nullptr;
   case ValID::t_InlineAsm: {
@@ -5170,14 +5466,14 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
         ID.APFloatVal.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
                               &Ignored);
       else if (Ty->isFloatTy())
-        ID.APFloatVal.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
-                              &Ignored);
+        ID.APFloatVal.convert(APFloat::IEEEsingle(),
+                              APFloat::rmNearestTiesToEven, &Ignored);
     }
     V = ConstantFP::get(Context, ID.APFloatVal);
 
     if (V->getType() != Ty)
       return Error(ID.Loc, "floating point constant does not have type '" +
-                   getTypeString(Ty) + "'");
+                               getTypeString(Ty) + "'");
 
     return false;
   case ValID::t_Null:
@@ -5225,8 +5521,10 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
       // Verify that the elements are compatible with the structtype.
       for (unsigned i = 0, e = ID.UIntVal; i != e; ++i)
         if (ID.ConstantStructElts[i]->getType() != ST->getElementType(i))
-          return Error(ID.Loc, "element " + Twine(i) +
-                    " of struct initializer doesn't match struct element type");
+          return Error(
+              ID.Loc,
+              "element " + Twine(i) +
+                  " of struct initializer doesn't match struct element type");
 
       V = ConstantStruct::get(
           ST, makeArrayRef(ID.ConstantStructElts.get(), ID.UIntVal));
@@ -5274,15 +5572,15 @@ bool LLParser::ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS) {
 
 bool LLParser::ParseTypeAndValue(Value *&V, PerFunctionState *PFS) {
   Type *Ty = nullptr;
-  return ParseType(Ty) ||
-         ParseValue(Ty, V, PFS);
+  return ParseType(Ty) || ParseValue(Ty, V, PFS);
 }
 
 bool LLParser::ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc,
                                       PerFunctionState &PFS) {
   Value *V;
   Loc = Lex.getLoc();
-  if (ParseTypeAndValue(V, PFS)) return true;
+  if (ParseTypeAndValue(V, PFS))
+    return true;
   if (!isa<BasicBlock>(V))
     return Error(Loc, "expected a basic block");
   BB = cast<BasicBlock>(V);
@@ -5347,7 +5645,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   std::string FunctionName;
   if (Lex.getKind() == lltok::GlobalVar) {
     FunctionName = Lex.getStrVal();
-  } else if (Lex.getKind() == lltok::GlobalID) {     // @42 is ok.
+  } else if (Lex.getKind() == lltok::GlobalID) { // @42 is ok.
     unsigned NameID = Lex.getUIntVal();
 
     if (NameID != NumberedVals.size())
@@ -5383,18 +5681,13 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
       ParseOptionalProgramAddrSpace(AddrSpace) ||
       ParseFnAttributeValuePairs(FuncAttrs, FwdRefAttrGrps, false,
                                  BuiltinLoc) ||
-      (EatIfPresent(lltok::kw_section) &&
-       ParseStringConstant(Section)) ||
-      (EatIfPresent(lltok::kw_partition) &&
-       ParseStringConstant(Partition)) ||
+      (EatIfPresent(lltok::kw_section) && ParseStringConstant(Section)) ||
+      (EatIfPresent(lltok::kw_partition) && ParseStringConstant(Partition)) ||
       parseOptionalComdat(FunctionName, C) ||
       ParseOptionalAlignment(Alignment) ||
-      (EatIfPresent(lltok::kw_gc) &&
-       ParseStringConstant(GC)) ||
-      (EatIfPresent(lltok::kw_prefix) &&
-       ParseGlobalTypeAndValue(Prefix)) ||
-      (EatIfPresent(lltok::kw_prologue) &&
-       ParseGlobalTypeAndValue(Prologue)) ||
+      (EatIfPresent(lltok::kw_gc) && ParseStringConstant(GC)) ||
+      (EatIfPresent(lltok::kw_prefix) && ParseGlobalTypeAndValue(Prefix)) ||
+      (EatIfPresent(lltok::kw_prologue) && ParseGlobalTypeAndValue(Prologue)) ||
       (EatIfPresent(lltok::kw_personality) &&
        ParseGlobalTypeAndValue(PersonalityFn)))
     return true;
@@ -5410,7 +5703,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
 
   // Okay, if we got here, the function is syntactically valid.  Convert types
   // and do semantic checks.
-  std::vector<Type*> ParamTypeList;
+  std::vector<Type *> ParamTypeList;
   SmallVector<AttributeSet, 8> Attrs;
 
   for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
@@ -5425,8 +5718,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   if (PAL.hasAttribute(1, Attribute::StructRet) && !RetType->isVoidTy())
     return Error(RetTypeLoc, "functions with 'sret' argument must return void");
 
-  FunctionType *FT =
-    FunctionType::get(RetType, ParamTypeList, isVarArg);
+  FunctionType *FT = FunctionType::get(RetType, ParamTypeList, isVarArg);
   PointerType *PFT = PointerType::get(FT, AddrSpace);
 
   Fn = nullptr;
@@ -5438,17 +5730,21 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
       Fn = M->getFunction(FunctionName);
       if (!Fn)
         return Error(FRVI->second.second, "invalid forward reference to "
-                     "function as global value!");
+                                          "function as global value!");
       if (Fn->getType() != PFT)
-        return Error(FRVI->second.second, "invalid forward reference to "
-                     "function '" + FunctionName + "' with wrong type: "
-                     "expected '" + getTypeString(PFT) + "' but was '" +
-                     getTypeString(Fn->getType()) + "'");
+        return Error(FRVI->second.second,
+                     "invalid forward reference to "
+                     "function '" +
+                         FunctionName +
+                         "' with wrong type: "
+                         "expected '" +
+                         getTypeString(PFT) + "' but was '" +
+                         getTypeString(Fn->getType()) + "'");
       ForwardRefVals.erase(FRVI);
     } else if ((Fn = M->getFunction(FunctionName))) {
       // Reject redefinitions.
-      return Error(NameLoc, "invalid redefinition of function '" +
-                   FunctionName + "'");
+      return Error(NameLoc,
+                   "invalid redefinition of function '" + FunctionName + "'");
     } else if (M->getNamedValue(FunctionName)) {
       return Error(NameLoc, "redefinition of function '@" + FunctionName + "'");
     }
@@ -5461,9 +5757,11 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
       Fn = cast<Function>(I->second.first);
       if (Fn->getType() != PFT)
         return Error(NameLoc, "type of definition and forward reference of '@" +
-                     Twine(NumberedVals.size()) + "' disagree: "
-                     "expected '" + getTypeString(PFT) + "' but was '" +
-                     getTypeString(Fn->getType()) + "'");
+                                  Twine(NumberedVals.size()) +
+                                  "' disagree: "
+                                  "expected '" +
+                                  getTypeString(PFT) + "' but was '" +
+                                  getTypeString(Fn->getType()) + "'");
       ForwardRefValIDs.erase(I);
     }
   }
@@ -5491,7 +5789,8 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   Fn->setPartition(Partition);
   Fn->setComdat(C);
   Fn->setPersonalityFn(PersonalityFn);
-  if (!GC.empty()) Fn->setGC(GC);
+  if (!GC.empty())
+    Fn->setGC(GC);
   Fn->setPrefixData(Prefix);
   Fn->setPrologueData(Prologue);
   ForwardRefAttrGroups[Fn] = FwdRefAttrGrps;
@@ -5500,14 +5799,15 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   Function::arg_iterator ArgIt = Fn->arg_begin();
   for (unsigned i = 0, e = ArgList.size(); i != e; ++i, ++ArgIt) {
     // If the argument has a name, insert it into the argument symbol table.
-    if (ArgList[i].Name.empty()) continue;
+    if (ArgList[i].Name.empty())
+      continue;
 
     // Set the name, if it conflicted, it will be auto-renamed.
     ArgIt->setName(ArgList[i].Name);
 
     if (ArgIt->getName() != ArgList[i].Name)
-      return Error(ArgList[i].Loc, "redefinition of argument '%" +
-                   ArgList[i].Name + "'");
+      return Error(ArgList[i].Loc,
+                   "redefinition of argument '%" + ArgList[i].Name + "'");
   }
 
   if (isDefine)
@@ -5570,10 +5870,11 @@ bool LLParser::PerFunctionState::resolveForwardRefBlockAddresses() {
 bool LLParser::ParseFunctionBody(Function &Fn) {
   if (Lex.getKind() != lltok::lbrace)
     return TokError("expected '{' in function body");
-  Lex.Lex();  // eat the {.
+  Lex.Lex(); // eat the {.
 
   int FunctionNumber = -1;
-  if (!Fn.hasName()) FunctionNumber = NumberedVals.size()-1;
+  if (!Fn.hasName())
+    FunctionNumber = NumberedVals.size() - 1;
 
   PerFunctionState PFS(*this, Fn, FunctionNumber);
 
@@ -5589,7 +5890,8 @@ bool LLParser::ParseFunctionBody(Function &Fn) {
 
   while (Lex.getKind() != lltok::rbrace &&
          Lex.getKind() != lltok::kw_uselistorder)
-    if (ParseBasicBlock(PFS)) return true;
+    if (ParseBasicBlock(PFS))
+      return true;
 
   while (Lex.getKind() != lltok::rbrace)
     if (ParseUseListOrder(&PFS))
@@ -5645,8 +5947,10 @@ bool LLParser::ParseBasicBlock(PerFunctionState &PFS) {
     }
 
     switch (ParseInstruction(Inst, BB, PFS)) {
-    default: llvm_unreachable("Unknown ParseInstruction result!");
-    case InstError: return true;
+    default:
+      llvm_unreachable("Unknown ParseInstruction result!");
+    case InstError:
+      return true;
     case InstNormal:
       BB->getInstList().push_back(Inst);
 
@@ -5667,7 +5971,8 @@ bool LLParser::ParseBasicBlock(PerFunctionState &PFS) {
     }
 
     // Set the name on the instruction.
-    if (PFS.SetInstName(NameID, NameStr, NameLoc, Inst)) return true;
+    if (PFS.SetInstName(NameID, NameStr, NameLoc, Inst))
+      return true;
   } while (!Inst->isTerminator());
 
   return false;
@@ -5686,28 +5991,43 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
     return TokError("found end of file when expecting more instructions");
   LocTy Loc = Lex.getLoc();
   unsigned KeywordVal = Lex.getUIntVal();
-  Lex.Lex();  // Eat the keyword.
+  Lex.Lex(); // Eat the keyword.
 
   switch (Token) {
-  default:                    return Error(Loc, "expected instruction opcode");
+  default:
+    return Error(Loc, "expected instruction opcode");
   // Terminator Instructions.
-  case lltok::kw_unreachable: Inst = new UnreachableInst(Context); return false;
-  case lltok::kw_ret:         return ParseRet(Inst, BB, PFS);
-  case lltok::kw_br:          return ParseBr(Inst, PFS);
-  case lltok::kw_switch:      return ParseSwitch(Inst, PFS);
-  case lltok::kw_indirectbr:  return ParseIndirectBr(Inst, PFS);
-  case lltok::kw_invoke:      return ParseInvoke(Inst, PFS);
-  case lltok::kw_resume:      return ParseResume(Inst, PFS);
-  case lltok::kw_cleanupret:  return ParseCleanupRet(Inst, PFS);
-  case lltok::kw_catchret:    return ParseCatchRet(Inst, PFS);
-  case lltok::kw_catchswitch: return ParseCatchSwitch(Inst, PFS);
-  case lltok::kw_catchpad:    return ParseCatchPad(Inst, PFS);
-  case lltok::kw_cleanuppad:  return ParseCleanupPad(Inst, PFS);
-  case lltok::kw_callbr:      return ParseCallBr(Inst, PFS);
+  case lltok::kw_unreachable:
+    Inst = new UnreachableInst(Context);
+    return false;
+  case lltok::kw_ret:
+    return ParseRet(Inst, BB, PFS);
+  case lltok::kw_br:
+    return ParseBr(Inst, PFS);
+  case lltok::kw_switch:
+    return ParseSwitch(Inst, PFS);
+  case lltok::kw_indirectbr:
+    return ParseIndirectBr(Inst, PFS);
+  case lltok::kw_invoke:
+    return ParseInvoke(Inst, PFS);
+  case lltok::kw_resume:
+    return ParseResume(Inst, PFS);
+  case lltok::kw_cleanupret:
+    return ParseCleanupRet(Inst, PFS);
+  case lltok::kw_catchret:
+    return ParseCatchRet(Inst, PFS);
+  case lltok::kw_catchswitch:
+    return ParseCatchSwitch(Inst, PFS);
+  case lltok::kw_catchpad:
+    return ParseCatchPad(Inst, PFS);
+  case lltok::kw_cleanuppad:
+    return ParseCleanupPad(Inst, PFS);
+  case lltok::kw_callbr:
+    return ParseCallBr(Inst, PFS);
   // Unary Operators.
   case lltok::kw_fneg: {
     FastMathFlags FMF = EatFastMathFlagsIfPresent();
-    int Res = ParseUnaryOp(Inst, PFS, KeywordVal, /*IsFP*/true);
+    int Res = ParseUnaryOp(Inst, PFS, KeywordVal, /*IsFP*/ true);
     if (Res != 0)
       return Res;
     if (FMF.any())
@@ -5721,12 +6041,16 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
   case lltok::kw_shl: {
     bool NUW = EatIfPresent(lltok::kw_nuw);
     bool NSW = EatIfPresent(lltok::kw_nsw);
-    if (!NUW) NUW = EatIfPresent(lltok::kw_nuw);
+    if (!NUW)
+      NUW = EatIfPresent(lltok::kw_nuw);
 
-    if (ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/false)) return true;
+    if (ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/ false))
+      return true;
 
-    if (NUW) cast<BinaryOperator>(Inst)->setHasNoUnsignedWrap(true);
-    if (NSW) cast<BinaryOperator>(Inst)->setHasNoSignedWrap(true);
+    if (NUW)
+      cast<BinaryOperator>(Inst)->setHasNoUnsignedWrap(true);
+    if (NSW)
+      cast<BinaryOperator>(Inst)->setHasNoSignedWrap(true);
     return false;
   }
   case lltok::kw_fadd:
@@ -5735,7 +6059,7 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
   case lltok::kw_fdiv:
   case lltok::kw_frem: {
     FastMathFlags FMF = EatFastMathFlagsIfPresent();
-    int Res = ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/true);
+    int Res = ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/ true);
     if (Res != 0)
       return Res;
     if (FMF.any())
@@ -5749,18 +6073,23 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
   case lltok::kw_ashr: {
     bool Exact = EatIfPresent(lltok::kw_exact);
 
-    if (ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/false)) return true;
-    if (Exact) cast<BinaryOperator>(Inst)->setIsExact(true);
+    if (ParseArithmetic(Inst, PFS, KeywordVal, /*IsFP*/ false))
+      return true;
+    if (Exact)
+      cast<BinaryOperator>(Inst)->setIsExact(true);
     return false;
   }
 
   case lltok::kw_urem:
-  case lltok::kw_srem:   return ParseArithmetic(Inst, PFS, KeywordVal,
-                                                /*IsFP*/false);
+  case lltok::kw_srem:
+    return ParseArithmetic(Inst, PFS, KeywordVal,
+                           /*IsFP*/ false);
   case lltok::kw_and:
   case lltok::kw_or:
-  case lltok::kw_xor:    return ParseLogical(Inst, PFS, KeywordVal);
-  case lltok::kw_icmp:   return ParseCompare(Inst, PFS, KeywordVal);
+  case lltok::kw_xor:
+    return ParseLogical(Inst, PFS, KeywordVal);
+  case lltok::kw_icmp:
+    return ParseCompare(Inst, PFS, KeywordVal);
   case lltok::kw_fcmp: {
     FastMathFlags FMF = EatFastMathFlagsIfPresent();
     int Res = ParseCompare(Inst, PFS, KeywordVal);
@@ -5784,7 +6113,8 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
   case lltok::kw_fptoui:
   case lltok::kw_fptosi:
   case lltok::kw_inttoptr:
-  case lltok::kw_ptrtoint:       return ParseCast(Inst, PFS, KeywordVal);
+  case lltok::kw_ptrtoint:
+    return ParseCast(Inst, PFS, KeywordVal);
   // Other.
   case lltok::kw_select: {
     FastMathFlags FMF = EatFastMathFlagsIfPresent();
@@ -5799,27 +6129,46 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
     }
     return 0;
   }
-  case lltok::kw_va_arg:         return ParseVA_Arg(Inst, PFS);
-  case lltok::kw_extractelement: return ParseExtractElement(Inst, PFS);
-  case lltok::kw_insertelement:  return ParseInsertElement(Inst, PFS);
-  case lltok::kw_shufflevector:  return ParseShuffleVector(Inst, PFS);
-  case lltok::kw_phi:            return ParsePHI(Inst, PFS);
-  case lltok::kw_landingpad:     return ParseLandingPad(Inst, PFS);
+  case lltok::kw_va_arg:
+    return ParseVA_Arg(Inst, PFS);
+  case lltok::kw_extractelement:
+    return ParseExtractElement(Inst, PFS);
+  case lltok::kw_insertelement:
+    return ParseInsertElement(Inst, PFS);
+  case lltok::kw_shufflevector:
+    return ParseShuffleVector(Inst, PFS);
+  case lltok::kw_phi:
+    return ParsePHI(Inst, PFS);
+  case lltok::kw_landingpad:
+    return ParseLandingPad(Inst, PFS);
   // Call.
-  case lltok::kw_call:     return ParseCall(Inst, PFS, CallInst::TCK_None);
-  case lltok::kw_tail:     return ParseCall(Inst, PFS, CallInst::TCK_Tail);
-  case lltok::kw_musttail: return ParseCall(Inst, PFS, CallInst::TCK_MustTail);
-  case lltok::kw_notail:   return ParseCall(Inst, PFS, CallInst::TCK_NoTail);
+  case lltok::kw_call:
+    return ParseCall(Inst, PFS, CallInst::TCK_None);
+  case lltok::kw_tail:
+    return ParseCall(Inst, PFS, CallInst::TCK_Tail);
+  case lltok::kw_musttail:
+    return ParseCall(Inst, PFS, CallInst::TCK_MustTail);
+  case lltok::kw_notail:
+    return ParseCall(Inst, PFS, CallInst::TCK_NoTail);
   // Memory.
-  case lltok::kw_alloca:         return ParseAlloc(Inst, PFS);
-  case lltok::kw_load:           return ParseLoad(Inst, PFS);
-  case lltok::kw_store:          return ParseStore(Inst, PFS);
-  case lltok::kw_cmpxchg:        return ParseCmpXchg(Inst, PFS);
-  case lltok::kw_atomicrmw:      return ParseAtomicRMW(Inst, PFS);
-  case lltok::kw_fence:          return ParseFence(Inst, PFS);
-  case lltok::kw_getelementptr: return ParseGetElementPtr(Inst, PFS);
-  case lltok::kw_extractvalue:  return ParseExtractValue(Inst, PFS);
-  case lltok::kw_insertvalue:   return ParseInsertValue(Inst, PFS);
+  case lltok::kw_alloca:
+    return ParseAlloc(Inst, PFS);
+  case lltok::kw_load:
+    return ParseLoad(Inst, PFS);
+  case lltok::kw_store:
+    return ParseStore(Inst, PFS);
+  case lltok::kw_cmpxchg:
+    return ParseCmpXchg(Inst, PFS);
+  case lltok::kw_atomicrmw:
+    return ParseAtomicRMW(Inst, PFS);
+  case lltok::kw_fence:
+    return ParseFence(Inst, PFS);
+  case lltok::kw_getelementptr:
+    return ParseGetElementPtr(Inst, PFS);
+  case lltok::kw_extractvalue:
+    return ParseExtractValue(Inst, PFS);
+  case lltok::kw_insertvalue:
+    return ParseInsertValue(Inst, PFS);
   }
 }
 
@@ -5827,37 +6176,91 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
 bool LLParser::ParseCmpPredicate(unsigned &P, unsigned Opc) {
   if (Opc == Instruction::FCmp) {
     switch (Lex.getKind()) {
-    default: return TokError("expected fcmp predicate (e.g. 'oeq')");
-    case lltok::kw_oeq: P = CmpInst::FCMP_OEQ; break;
-    case lltok::kw_one: P = CmpInst::FCMP_ONE; break;
-    case lltok::kw_olt: P = CmpInst::FCMP_OLT; break;
-    case lltok::kw_ogt: P = CmpInst::FCMP_OGT; break;
-    case lltok::kw_ole: P = CmpInst::FCMP_OLE; break;
-    case lltok::kw_oge: P = CmpInst::FCMP_OGE; break;
-    case lltok::kw_ord: P = CmpInst::FCMP_ORD; break;
-    case lltok::kw_uno: P = CmpInst::FCMP_UNO; break;
-    case lltok::kw_ueq: P = CmpInst::FCMP_UEQ; break;
-    case lltok::kw_une: P = CmpInst::FCMP_UNE; break;
-    case lltok::kw_ult: P = CmpInst::FCMP_ULT; break;
-    case lltok::kw_ugt: P = CmpInst::FCMP_UGT; break;
-    case lltok::kw_ule: P = CmpInst::FCMP_ULE; break;
-    case lltok::kw_uge: P = CmpInst::FCMP_UGE; break;
-    case lltok::kw_true: P = CmpInst::FCMP_TRUE; break;
-    case lltok::kw_false: P = CmpInst::FCMP_FALSE; break;
+    default:
+      return TokError("expected fcmp predicate (e.g. 'oeq')");
+    case lltok::kw_oeq:
+      P = CmpInst::FCMP_OEQ;
+      break;
+    case lltok::kw_one:
+      P = CmpInst::FCMP_ONE;
+      break;
+    case lltok::kw_olt:
+      P = CmpInst::FCMP_OLT;
+      break;
+    case lltok::kw_ogt:
+      P = CmpInst::FCMP_OGT;
+      break;
+    case lltok::kw_ole:
+      P = CmpInst::FCMP_OLE;
+      break;
+    case lltok::kw_oge:
+      P = CmpInst::FCMP_OGE;
+      break;
+    case lltok::kw_ord:
+      P = CmpInst::FCMP_ORD;
+      break;
+    case lltok::kw_uno:
+      P = CmpInst::FCMP_UNO;
+      break;
+    case lltok::kw_ueq:
+      P = CmpInst::FCMP_UEQ;
+      break;
+    case lltok::kw_une:
+      P = CmpInst::FCMP_UNE;
+      break;
+    case lltok::kw_ult:
+      P = CmpInst::FCMP_ULT;
+      break;
+    case lltok::kw_ugt:
+      P = CmpInst::FCMP_UGT;
+      break;
+    case lltok::kw_ule:
+      P = CmpInst::FCMP_ULE;
+      break;
+    case lltok::kw_uge:
+      P = CmpInst::FCMP_UGE;
+      break;
+    case lltok::kw_true:
+      P = CmpInst::FCMP_TRUE;
+      break;
+    case lltok::kw_false:
+      P = CmpInst::FCMP_FALSE;
+      break;
     }
   } else {
     switch (Lex.getKind()) {
-    default: return TokError("expected icmp predicate (e.g. 'eq')");
-    case lltok::kw_eq:  P = CmpInst::ICMP_EQ; break;
-    case lltok::kw_ne:  P = CmpInst::ICMP_NE; break;
-    case lltok::kw_slt: P = CmpInst::ICMP_SLT; break;
-    case lltok::kw_sgt: P = CmpInst::ICMP_SGT; break;
-    case lltok::kw_sle: P = CmpInst::ICMP_SLE; break;
-    case lltok::kw_sge: P = CmpInst::ICMP_SGE; break;
-    case lltok::kw_ult: P = CmpInst::ICMP_ULT; break;
-    case lltok::kw_ugt: P = CmpInst::ICMP_UGT; break;
-    case lltok::kw_ule: P = CmpInst::ICMP_ULE; break;
-    case lltok::kw_uge: P = CmpInst::ICMP_UGE; break;
+    default:
+      return TokError("expected icmp predicate (e.g. 'eq')");
+    case lltok::kw_eq:
+      P = CmpInst::ICMP_EQ;
+      break;
+    case lltok::kw_ne:
+      P = CmpInst::ICMP_NE;
+      break;
+    case lltok::kw_slt:
+      P = CmpInst::ICMP_SLT;
+      break;
+    case lltok::kw_sgt:
+      P = CmpInst::ICMP_SGT;
+      break;
+    case lltok::kw_sle:
+      P = CmpInst::ICMP_SLE;
+      break;
+    case lltok::kw_sge:
+      P = CmpInst::ICMP_SGE;
+      break;
+    case lltok::kw_ult:
+      P = CmpInst::ICMP_ULT;
+      break;
+    case lltok::kw_ugt:
+      P = CmpInst::ICMP_UGT;
+      break;
+    case lltok::kw_ule:
+      P = CmpInst::ICMP_ULE;
+      break;
+    case lltok::kw_uge:
+      P = CmpInst::ICMP_UGE;
+      break;
     }
   }
   Lex.Lex();
@@ -5875,25 +6278,27 @@ bool LLParser::ParseRet(Instruction *&Inst, BasicBlock *BB,
                         PerFunctionState &PFS) {
   SMLoc TypeLoc = Lex.getLoc();
   Type *Ty = nullptr;
-  if (ParseType(Ty, true /*void allowed*/)) return true;
+  if (ParseType(Ty, true /*void allowed*/))
+    return true;
 
   Type *ResType = PFS.getFunction().getReturnType();
 
   if (Ty->isVoidTy()) {
     if (!ResType->isVoidTy())
       return Error(TypeLoc, "value doesn't match function result type '" +
-                   getTypeString(ResType) + "'");
+                                getTypeString(ResType) + "'");
 
     Inst = ReturnInst::Create(Context);
     return false;
   }
 
   Value *RV;
-  if (ParseValue(Ty, RV, PFS)) return true;
+  if (ParseValue(Ty, RV, PFS))
+    return true;
 
   if (ResType != RV->getType())
     return Error(TypeLoc, "value doesn't match function result type '" +
-                 getTypeString(ResType) + "'");
+                              getTypeString(ResType) + "'");
 
   Inst = ReturnInst::Create(Context, RV);
   return false;
@@ -5906,7 +6311,8 @@ bool LLParser::ParseBr(Instruction *&Inst, PerFunctionState &PFS) {
   LocTy Loc, Loc2;
   Value *Op0;
   BasicBlock *Op1, *Op2;
-  if (ParseTypeAndValue(Op0, Loc, PFS)) return true;
+  if (ParseTypeAndValue(Op0, Loc, PFS))
+    return true;
 
   if (BasicBlock *BB = dyn_cast<BasicBlock>(Op0)) {
     Inst = BranchInst::Create(BB);
@@ -5945,8 +6351,8 @@ bool LLParser::ParseSwitch(Instruction *&Inst, PerFunctionState &PFS) {
     return Error(CondLoc, "switch condition must have integer type");
 
   // Parse the jump table pairs.
-  SmallPtrSet<Value*, 32> SeenCases;
-  SmallVector<std::pair<ConstantInt*, BasicBlock*>, 32> Table;
+  SmallPtrSet<Value *, 32> SeenCases;
+  SmallVector<std::pair<ConstantInt *, BasicBlock *>, 32> Table;
   while (Lex.getKind() != lltok::rsquare) {
     Value *Constant;
     BasicBlock *DestBB;
@@ -5964,7 +6370,7 @@ bool LLParser::ParseSwitch(Instruction *&Inst, PerFunctionState &PFS) {
     Table.push_back(std::make_pair(cast<ConstantInt>(Constant), DestBB));
   }
 
-  Lex.Lex();  // Eat the ']'.
+  Lex.Lex(); // Eat the ']'.
 
   SwitchInst *SI = SwitchInst::Create(Cond, DefaultBB, Table.size());
   for (unsigned i = 0, e = Table.size(); i != e; ++i)
@@ -5988,7 +6394,7 @@ bool LLParser::ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS) {
     return Error(AddrLoc, "indirectbr address must have pointer type");
 
   // Parse the destination list.
-  SmallVector<BasicBlock*, 16> DestList;
+  SmallVector<BasicBlock *, 16> DestList;
 
   if (Lex.getKind() != lltok::rsquare) {
     BasicBlock *DestBB;
@@ -6049,7 +6455,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
   FunctionType *Ty = dyn_cast<FunctionType>(RetType);
   if (!Ty) {
     // Pull out the types of all of the arguments...
-    std::vector<Type*> ParamTypes;
+    std::vector<Type *> ParamTypes;
     for (unsigned i = 0, e = ArgList.size(); i != e; ++i)
       ParamTypes.push_back(ArgList[i].V->getType());
 
@@ -6085,7 +6491,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
 
     if (ExpectedTy && ExpectedTy != ArgList[i].V->getType())
       return Error(ArgList[i].Loc, "argument is not of expected type '" +
-                   getTypeString(ExpectedTy) + "'");
+                                       getTypeString(ExpectedTy) + "'");
     Args.push_back(ArgList[i].V);
     ArgAttrs.push_back(ArgList[i].Attrs);
   }
@@ -6113,7 +6519,8 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
 /// ParseResume
 ///   ::= 'resume' TypeAndValue
 bool LLParser::ParseResume(Instruction *&Inst, PerFunctionState &PFS) {
-  Value *Exn; LocTy ExnLoc;
+  Value *Exn;
+  LocTy ExnLoc;
   if (ParseTypeAndValue(Exn, ExnLoc, PFS))
     return true;
 
@@ -6150,7 +6557,7 @@ bool LLParser::ParseExceptionArgs(SmallVectorImpl<Value *> &Args,
     Args.push_back(V);
   }
 
-  Lex.Lex();  // Lex the ']'.
+  Lex.Lex(); // Lex the ']'.
   return false;
 }
 
@@ -6197,7 +6604,7 @@ bool LLParser::ParseCatchRet(Instruction *&Inst, PerFunctionState &PFS) {
   BasicBlock *BB;
   if (ParseToken(lltok::kw_to, "expected 'to' in catchret") ||
       ParseTypeAndBasicBlock(BB, PFS))
-      return true;
+    return true;
 
   Inst = CatchReturnInst::Create(CatchPad, BB);
   return false;
@@ -6232,8 +6639,7 @@ bool LLParser::ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS) {
   if (ParseToken(lltok::rsquare, "expected ']' after catchswitch labels"))
     return true;
 
-  if (ParseToken(lltok::kw_unwind,
-                 "expected 'unwind' after catchswitch scope"))
+  if (ParseToken(lltok::kw_unwind, "expected 'unwind' after catchswitch scope"))
     return true;
 
   BasicBlock *UnwindBB = nullptr;
@@ -6309,7 +6715,8 @@ bool LLParser::ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS) {
 /// operand is allowed.
 bool LLParser::ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS,
                             unsigned Opc, bool IsFP) {
-  LocTy Loc; Value *LHS;
+  LocTy Loc;
+  Value *LHS;
   if (ParseTypeAndValue(LHS, Loc, PFS))
     return true;
 
@@ -6431,9 +6838,8 @@ bool LLParser::ParseCallBr(Instruction *&Inst, PerFunctionState &PFS) {
       AttributeList::get(Context, AttributeSet::get(Context, FnAttrs),
                          AttributeSet::get(Context, RetAttrs), ArgAttrs);
 
-  CallBrInst *CBI =
-      CallBrInst::Create(Ty, Callee, DefaultDest, IndirectDests, Args,
-                         BundleList);
+  CallBrInst *CBI = CallBrInst::Create(Ty, Callee, DefaultDest, IndirectDests,
+                                       Args, BundleList);
   CBI->setCallingConv(CC);
   CBI->setAttributes(PAL);
   ForwardRefAttrGroups[CBI] = FwdRefAttrGrps;
@@ -6452,7 +6858,8 @@ bool LLParser::ParseCallBr(Instruction *&Inst, PerFunctionState &PFS) {
 /// operand is allowed.
 bool LLParser::ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS,
                                unsigned Opc, bool IsFP) {
-  LocTy Loc; Value *LHS, *RHS;
+  LocTy Loc;
+  Value *LHS, *RHS;
   if (ParseTypeAndValue(LHS, Loc, PFS) ||
       ParseToken(lltok::comma, "expected ',' in arithmetic operation") ||
       ParseValue(LHS->getType(), RHS, PFS))
@@ -6472,14 +6879,16 @@ bool LLParser::ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS,
 ///  ::= ArithmeticOps TypeAndValue ',' Value {
 bool LLParser::ParseLogical(Instruction *&Inst, PerFunctionState &PFS,
                             unsigned Opc) {
-  LocTy Loc; Value *LHS, *RHS;
+  LocTy Loc;
+  Value *LHS, *RHS;
   if (ParseTypeAndValue(LHS, Loc, PFS) ||
       ParseToken(lltok::comma, "expected ',' in logical operation") ||
       ParseValue(LHS->getType(), RHS, PFS))
     return true;
 
   if (!LHS->getType()->isIntOrIntVectorTy())
-    return Error(Loc,"instruction requires integer or integer vector operands");
+    return Error(Loc,
+                 "instruction requires integer or integer vector operands");
 
   Inst = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
   return false;
@@ -6494,8 +6903,7 @@ bool LLParser::ParseCompare(Instruction *&Inst, PerFunctionState &PFS,
   LocTy Loc;
   unsigned Pred;
   Value *LHS, *RHS;
-  if (ParseCmpPredicate(Pred, Opc) ||
-      ParseTypeAndValue(LHS, Loc, PFS) ||
+  if (ParseCmpPredicate(Pred, Opc) || ParseTypeAndValue(LHS, Loc, PFS) ||
       ParseToken(lltok::comma, "expected ',' after compare value") ||
       ParseValue(LHS->getType(), RHS, PFS))
     return true;
@@ -6518,7 +6926,6 @@ bool LLParser::ParseCompare(Instruction *&Inst, PerFunctionState &PFS,
 // Other Instructions.
 //===----------------------------------------------------------------------===//
 
-
 /// ParseCast
 ///   ::= CastOpc TypeAndValue 'to' Type
 bool LLParser::ParseCast(Instruction *&Inst, PerFunctionState &PFS,
@@ -6534,8 +6941,8 @@ bool LLParser::ParseCast(Instruction *&Inst, PerFunctionState &PFS,
   if (!CastInst::castIsValid((Instruction::CastOps)Opc, Op, DestTy)) {
     CastInst::castIsValid((Instruction::CastOps)Opc, Op, DestTy);
     return Error(Loc, "invalid cast opcode for cast from '" +
-                 getTypeString(Op->getType()) + "' to '" +
-                 getTypeString(DestTy) + "'");
+                          getTypeString(Op->getType()) + "' to '" +
+                          getTypeString(DestTy) + "'");
   }
   Inst = CastInst::Create((Instruction::CastOps)Opc, Op, DestTy);
   return false;
@@ -6636,7 +7043,8 @@ bool LLParser::ParseShuffleVector(Instruction *&Inst, PerFunctionState &PFS) {
 /// ParsePHI
 ///   ::= 'phi' Type '[' Value ',' Value ']' (',' '[' Value ',' Value ']')*
 int LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) {
-  Type *Ty = nullptr;  LocTy TypeLoc;
+  Type *Ty = nullptr;
+  LocTy TypeLoc;
   Value *Op0, *Op1;
 
   if (ParseType(Ty, TypeLoc) ||
@@ -6648,7 +7056,7 @@ int LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) {
     return true;
 
   bool AteExtraComma = false;
-  SmallVector<std::pair<Value*, BasicBlock*>, 16> PHIVals;
+  SmallVector<std::pair<Value *, BasicBlock *>, 16> PHIVals;
 
   while (true) {
     PHIVals.push_back(std::make_pair(Op0, cast<BasicBlock>(Op1)));
@@ -6686,7 +7094,8 @@ int LLParser::ParsePHI(Instruction *&Inst, PerFunctionState &PFS) {
 ///   ::= 'filter'
 ///   ::= 'filter' TypeAndValue ( ',' TypeAndValue )*
 bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) {
-  Type *Ty = nullptr; LocTy TyLoc;
+  Type *Ty = nullptr;
+  LocTy TyLoc;
 
   if (ParseType(Ty, TyLoc))
     return true;
@@ -6694,7 +7103,8 @@ bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) {
   std::unique_ptr<LandingPadInst> LP(LandingPadInst::Create(Ty, 0));
   LP->setCleanup(EatIfPresent(lltok::kw_cleanup));
 
-  while (Lex.getKind() == lltok::kw_catch || Lex.getKind() == lltok::kw_filter){
+  while (Lex.getKind() == lltok::kw_catch ||
+         Lex.getKind() == lltok::kw_filter) {
     LandingPadInst::ClauseType CT;
     if (EatIfPresent(lltok::kw_catch))
       CT = LandingPadInst::Catch;
@@ -6778,7 +7188,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
   FunctionType *Ty = dyn_cast<FunctionType>(RetType);
   if (!Ty) {
     // Pull out the types of all of the arguments...
-    std::vector<Type*> ParamTypes;
+    std::vector<Type *> ParamTypes;
     for (unsigned i = 0, e = ArgList.size(); i != e; ++i)
       ParamTypes.push_back(ArgList[i].V->getType());
 
@@ -6799,7 +7209,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
   // Set up the Attribute for the function.
   SmallVector<AttributeSet, 8> Attrs;
 
-  SmallVector<Value*, 8> Args;
+  SmallVector<Value *, 8> Args;
 
   // Loop through FunctionType's arguments and ensure they are specified
   // correctly.  Also, gather any parameter attributes.
@@ -6815,7 +7225,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
 
     if (ExpectedTy && ExpectedTy != ArgList[i].V->getType())
       return Error(ArgList[i].Loc, "argument is not of expected type '" +
-                   getTypeString(ExpectedTy) + "'");
+                                       getTypeString(ExpectedTy) + "'");
     Args.push_back(ArgList[i].V);
     Attrs.push_back(ArgList[i].Attrs);
   }
@@ -6859,7 +7269,8 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
   bool IsInAlloca = EatIfPresent(lltok::kw_inalloca);
   bool IsSwiftError = EatIfPresent(lltok::kw_swifterror);
 
-  if (ParseType(Ty, TyLoc)) return true;
+  if (ParseType(Ty, TyLoc))
+    return true;
 
   if (Ty->isFunctionTy() || !PointerType::isValidElementType(Ty))
     return Error(TyLoc, "invalid type for alloca");
@@ -6912,7 +7323,8 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
 ///   ::= 'load' 'atomic' 'volatile'? TypeAndValue
 ///       'singlethread'? AtomicOrdering (',' 'align' i32)?
 int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) {
-  Value *Val; LocTy Loc;
+  Value *Val;
+  LocTy Loc;
   unsigned Alignment = 0;
   bool AteExtraComma = false;
   bool isAtomic = false;
@@ -6961,7 +7373,8 @@ int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) {
 ///   ::= 'store' 'atomic' 'volatile'? TypeAndValue ',' TypeAndValue
 ///       'singlethread'? AtomicOrdering (',' 'align' i32)?
 int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) {
-  Value *Val, *Ptr; LocTy Loc, PtrLoc;
+  Value *Val, *Ptr;
+  LocTy Loc, PtrLoc;
   unsigned Alignment = 0;
   bool AteExtraComma = false;
   bool isAtomic = false;
@@ -7006,7 +7419,8 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) {
 ///   ::= 'cmpxchg' 'weak'? 'volatile'? TypeAndValue ',' TypeAndValue ','
 ///       TypeAndValue 'singlethread'? AtomicOrdering AtomicOrdering
 int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
-  Value *Ptr, *Cmp, *New; LocTy PtrLoc, CmpLoc, NewLoc;
+  Value *Ptr, *Cmp, *New;
+  LocTy PtrLoc, CmpLoc, NewLoc;
   bool AteExtraComma = false;
   AtomicOrdering SuccessOrdering = AtomicOrdering::NotAtomic;
   AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
@@ -7047,8 +7461,8 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
     return Error(NewLoc, "new value and pointer type do not match");
   if (!New->getType()->isFirstClassType())
     return Error(NewLoc, "cmpxchg operand must be a first class value");
-  AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst(
-      Ptr, Cmp, New, SuccessOrdering, FailureOrdering, SSID);
+  AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst(Ptr, Cmp, New, SuccessOrdering,
+                                                 FailureOrdering, SSID);
   CXI->setVolatile(isVolatile);
   CXI->setWeak(isWeak);
   Inst = CXI;
@@ -7059,7 +7473,8 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
 ///   ::= 'atomicrmw' 'volatile'? BinOp TypeAndValue ',' TypeAndValue
 ///       'singlethread'? AtomicOrdering
 int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
-  Value *Ptr, *Val; LocTy PtrLoc, ValLoc;
+  Value *Ptr, *Val;
+  LocTy PtrLoc, ValLoc;
   bool AteExtraComma = false;
   AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
   SyncScope::ID SSID = SyncScope::System;
@@ -7071,18 +7486,41 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
     isVolatile = true;
 
   switch (Lex.getKind()) {
-  default: return TokError("expected binary operation in atomicrmw");
-  case lltok::kw_xchg: Operation = AtomicRMWInst::Xchg; break;
-  case lltok::kw_add: Operation = AtomicRMWInst::Add; break;
-  case lltok::kw_sub: Operation = AtomicRMWInst::Sub; break;
-  case lltok::kw_and: Operation = AtomicRMWInst::And; break;
-  case lltok::kw_nand: Operation = AtomicRMWInst::Nand; break;
-  case lltok::kw_or: Operation = AtomicRMWInst::Or; break;
-  case lltok::kw_xor: Operation = AtomicRMWInst::Xor; break;
-  case lltok::kw_max: Operation = AtomicRMWInst::Max; break;
-  case lltok::kw_min: Operation = AtomicRMWInst::Min; break;
-  case lltok::kw_umax: Operation = AtomicRMWInst::UMax; break;
-  case lltok::kw_umin: Operation = AtomicRMWInst::UMin; break;
+  default:
+    return TokError("expected binary operation in atomicrmw");
+  case lltok::kw_xchg:
+    Operation = AtomicRMWInst::Xchg;
+    break;
+  case lltok::kw_add:
+    Operation = AtomicRMWInst::Add;
+    break;
+  case lltok::kw_sub:
+    Operation = AtomicRMWInst::Sub;
+    break;
+  case lltok::kw_and:
+    Operation = AtomicRMWInst::And;
+    break;
+  case lltok::kw_nand:
+    Operation = AtomicRMWInst::Nand;
+    break;
+  case lltok::kw_or:
+    Operation = AtomicRMWInst::Or;
+    break;
+  case lltok::kw_xor:
+    Operation = AtomicRMWInst::Xor;
+    break;
+  case lltok::kw_max:
+    Operation = AtomicRMWInst::Max;
+    break;
+  case lltok::kw_min:
+    Operation = AtomicRMWInst::Min;
+    break;
+  case lltok::kw_umax:
+    Operation = AtomicRMWInst::UMax;
+    break;
+  case lltok::kw_umin:
+    Operation = AtomicRMWInst::UMin;
+    break;
   case lltok::kw_fadd:
     Operation = AtomicRMWInst::FAdd;
     IsFP = true;
@@ -7092,7 +7530,7 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
     IsFP = true;
     break;
   }
-  Lex.Lex();  // Eat the operation.
+  Lex.Lex(); // Eat the operation.
 
   if (ParseTypeAndValue(Ptr, PtrLoc, PFS) ||
       ParseToken(lltok::comma, "expected ',' after atomicrmw address") ||
@@ -7110,21 +7548,21 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
   if (Operation == AtomicRMWInst::Xchg) {
     if (!Val->getType()->isIntegerTy() &&
         !Val->getType()->isFloatingPointTy()) {
-      return Error(ValLoc, "atomicrmw " +
-                   AtomicRMWInst::getOperationName(Operation) +
-                   " operand must be an integer or floating point type");
+      return Error(ValLoc,
+                   "atomicrmw " + AtomicRMWInst::getOperationName(Operation) +
+                       " operand must be an integer or floating point type");
     }
   } else if (IsFP) {
     if (!Val->getType()->isFloatingPointTy()) {
       return Error(ValLoc, "atomicrmw " +
-                   AtomicRMWInst::getOperationName(Operation) +
-                   " operand must be a floating point type");
+                               AtomicRMWInst::getOperationName(Operation) +
+                               " operand must be a floating point type");
     }
   } else {
     if (!Val->getType()->isIntegerTy()) {
       return Error(ValLoc, "atomicrmw " +
-                   AtomicRMWInst::getOperationName(Operation) +
-                   " operand must be an integer");
+                               AtomicRMWInst::getOperationName(Operation) +
+                               " operand must be an integer");
     }
   }
 
@@ -7133,8 +7571,7 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
     return Error(ValLoc, "atomicrmw operand must be power-of-two byte-sized"
                          " integer");
 
-  AtomicRMWInst *RMWI =
-    new AtomicRMWInst(Operation, Ptr, Val, Ordering, SSID);
+  AtomicRMWInst *RMWI = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SSID);
   RMWI->setVolatile(isVolatile);
   Inst = RMWI;
   return AteExtraComma ? InstExtraComma : InstNormal;
@@ -7174,7 +7611,8 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
     return true;
 
   Type *BaseType = Ptr->getType();
-  PointerType *BasePointerType = dyn_cast<PointerType>(BaseType->getScalarType());
+  PointerType *BasePointerType =
+      dyn_cast<PointerType>(BaseType->getScalarType());
   if (!BasePointerType)
     return Error(Loc, "base of getelementptr must be a pointer");
 
@@ -7182,33 +7620,35 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
     return Error(ExplicitTypeLoc,
                  "explicit pointee type doesn't match operand's pointee type");
 
-  SmallVector<Value*, 16> Indices;
+  SmallVector<Value *, 16> Indices;
   bool AteExtraComma = false;
   // GEP returns a vector of pointers if at least one of parameters is a vector.
   // All vector parameters should have the same vector width.
-  unsigned GEPWidth = BaseType->isVectorTy() ?
-    BaseType->getVectorNumElements() : 0;
+  unsigned GEPWidth =
+      BaseType->isVectorTy() ? BaseType->getVectorNumElements() : 0;
 
   while (EatIfPresent(lltok::comma)) {
     if (Lex.getKind() == lltok::MetadataVar) {
       AteExtraComma = true;
       break;
     }
-    if (ParseTypeAndValue(Val, EltLoc, PFS)) return true;
+    if (ParseTypeAndValue(Val, EltLoc, PFS))
+      return true;
     if (!Val->getType()->isIntOrIntVectorTy())
       return Error(EltLoc, "getelementptr index must be an integer");
 
     if (Val->getType()->isVectorTy()) {
       unsigned ValNumEl = Val->getType()->getVectorNumElements();
       if (GEPWidth && GEPWidth != ValNumEl)
-        return Error(EltLoc,
-          "getelementptr vector index has a wrong number of elements");
+        return Error(
+            EltLoc,
+            "getelementptr vector index has a wrong number of elements");
       GEPWidth = ValNumEl;
     }
     Indices.push_back(Val);
   }
 
-  SmallPtrSet<Type*, 4> Visited;
+  SmallPtrSet<Type *, 4> Visited;
   if (!Indices.empty() && !Ty->isSized(&Visited))
     return Error(Loc, "base element of getelementptr must be sized");
 
@@ -7223,7 +7663,8 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
 /// ParseExtractValue
 ///   ::= 'extractvalue' TypeAndValue (',' uint32)+
 int LLParser::ParseExtractValue(Instruction *&Inst, PerFunctionState &PFS) {
-  Value *Val; LocTy Loc;
+  Value *Val;
+  LocTy Loc;
   SmallVector<unsigned, 4> Indices;
   bool AteExtraComma;
   if (ParseTypeAndValue(Val, Loc, PFS) ||
@@ -7242,7 +7683,8 @@ int LLParser::ParseExtractValue(Instruction *&Inst, PerFunctionState &PFS) {
 /// ParseInsertValue
 ///   ::= 'insertvalue' TypeAndValue ',' TypeAndValue (',' uint32)+
 int LLParser::ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS) {
-  Value *Val0, *Val1; LocTy Loc0, Loc1;
+  Value *Val0, *Val1;
+  LocTy Loc0, Loc1;
   SmallVector<unsigned, 4> Indices;
   bool AteExtraComma;
   if (ParseTypeAndValue(Val0, Loc0, PFS) ||
@@ -7254,7 +7696,8 @@ int LLParser::ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS) {
   if (!Val0->getType()->isAggregateType())
     return Error(Loc0, "insertvalue operand must be aggregate type");
 
-  Type *IndexedType = ExtractValueInst::getIndexedType(Val0->getType(), Indices);
+  Type *IndexedType =
+      ExtractValueInst::getIndexedType(Val0->getType(), Indices);
   if (!IndexedType)
     return Error(Loc0, "invalid indices for insertvalue");
   if (IndexedType != Val1->getType())
@@ -7359,7 +7802,8 @@ bool LLParser::ParseUseListOrderIndexes(SmallVectorImpl<unsigned> &Indexes) {
   if (Indexes.size() < 2)
     return Error(Loc, "expected >= 2 uselistorder indexes");
   if (Offset != 0 || Max >= Indexes.size())
-    return Error(Loc, "expected distinct uselistorder indexes in range [0, size)");
+    return Error(Loc,
+                 "expected distinct uselistorder indexes in range [0, size)");
   if (IsOrdered)
     return Error(Loc, "expected uselistorder indexes to change the order");
 
@@ -7408,7 +7852,8 @@ bool LLParser::ParseUseListOrderBB() {
   else
     return Error(Fn.Loc, "expected function name in uselistorder_bb");
   if (!GV)
-    return Error(Fn.Loc, "invalid function forward reference in uselistorder_bb");
+    return Error(Fn.Loc,
+                 "invalid function forward reference in uselistorder_bb");
   auto *F = dyn_cast<Function>(GV);
   if (!F)
     return Error(Fn.Loc, "expected function name in uselistorder_bb");
diff --git a/hpvm/llvm_patches/lib/AsmParser/LLParser.h b/hpvm/llvm_patches/lib/AsmParser/LLParser.h
index 610e2e262008190fc3102c4833846c2f70abe712..bc1983232f0570d816a23bc1f92ce490a44dee59 100644
--- a/hpvm/llvm_patches/lib/AsmParser/LLParser.h
+++ b/hpvm/llvm_patches/lib/AsmParser/LLParser.h
@@ -26,587 +26,606 @@
 #include <map>
 
 namespace llvm {
-  class Module;
-  class OpaqueType;
-  class Function;
-  class Value;
-  class BasicBlock;
-  class Instruction;
-  class Constant;
-  class GlobalValue;
-  class Comdat;
-  class MDString;
-  class MDNode;
-  struct SlotMapping;
-  class StructType;
-
-  /// ValID - Represents a reference of a definition of some sort with no type.
-  /// There are several cases where we have to parse the value but where the
-  /// type can depend on later context.  This may either be a numeric reference
-  /// or a symbolic (%var) reference.  This is just a discriminated union.
-  struct ValID {
-    enum {
-      t_LocalID, t_GlobalID,           // ID in UIntVal.
-      t_LocalName, t_GlobalName,       // Name in StrVal.
-      t_APSInt, t_APFloat,             // Value in APSIntVal/APFloatVal.
-      t_Null, t_Undef, t_Zero, t_None, // No value.
-      t_EmptyArray,                    // No value:  []
-      t_Constant,                      // Value in ConstantVal.
-      t_InlineAsm,                     // Value in FTy/StrVal/StrVal2/UIntVal.
-      t_ConstantStruct,                // Value in ConstantStructElts.
-      t_PackedConstantStruct           // Value in ConstantStructElts.
-    } Kind = t_LocalID;
-
-    LLLexer::LocTy Loc;
-    unsigned UIntVal;
-    FunctionType *FTy = nullptr;
-    std::string StrVal, StrVal2;
-    APSInt APSIntVal;
-    APFloat APFloatVal{0.0};
-    Constant *ConstantVal;
-    std::unique_ptr<Constant *[]> ConstantStructElts;
-
-    ValID() = default;
-    ValID(const ValID &RHS)
-        : Kind(RHS.Kind), Loc(RHS.Loc), UIntVal(RHS.UIntVal), FTy(RHS.FTy),
-          StrVal(RHS.StrVal), StrVal2(RHS.StrVal2), APSIntVal(RHS.APSIntVal),
-          APFloatVal(RHS.APFloatVal), ConstantVal(RHS.ConstantVal) {
-      assert(!RHS.ConstantStructElts);
-    }
+class Module;
+class OpaqueType;
+class Function;
+class Value;
+class BasicBlock;
+class Instruction;
+class Constant;
+class GlobalValue;
+class Comdat;
+class MDString;
+class MDNode;
+struct SlotMapping;
+class StructType;
+
+/// ValID - Represents a reference to a definition of some sort with no type.
+/// There are several cases where we have to parse the value but where the
+/// type can depend on later context.  This may either be a numeric reference
+/// or a symbolic (%var) reference.  This is just a discriminated union.
+struct ValID {
+  enum {
+    t_LocalID, // ID in UIntVal.
+    t_GlobalID, // ID in UIntVal.
+    t_LocalName, // Name in StrVal.
+    t_GlobalName, // Name in StrVal.
+    t_APSInt, // Value in APSIntVal/APFloatVal.
+    t_APFloat, // Value in APSIntVal/APFloatVal.
+    t_Null,                // No value.
+    t_Undef,               // No value.
+    t_Zero,                // No value.
+    t_None,                // No value.
+    t_EmptyArray,          // No value:  []
+    t_Constant,            // Value in ConstantVal.
+    t_InlineAsm,           // Value in FTy/StrVal/StrVal2/UIntVal.
+    t_ConstantStruct,      // Value in ConstantStructElts.
+    t_PackedConstantStruct // Value in ConstantStructElts.
+  } Kind = t_LocalID;
+
+  LLLexer::LocTy Loc;
+  unsigned UIntVal;
+  FunctionType *FTy = nullptr;
+  std::string StrVal, StrVal2;
+  APSInt APSIntVal;
+  APFloat APFloatVal{0.0};
+  Constant *ConstantVal;
+  std::unique_ptr<Constant *[]> ConstantStructElts;
+
+  ValID() = default;
+  ValID(const ValID &RHS) // Copies every member except ConstantStructElts.
+      : Kind(RHS.Kind), Loc(RHS.Loc), UIntVal(RHS.UIntVal), FTy(RHS.FTy),
+        StrVal(RHS.StrVal), StrVal2(RHS.StrVal2), APSIntVal(RHS.APSIntVal),
+        APFloatVal(RHS.APFloatVal), ConstantVal(RHS.ConstantVal) {
+    assert(!RHS.ConstantStructElts); // Source must not hold struct elements.
+  }
+
+  bool operator<(const ValID &RHS) const {
+    if (Kind == t_LocalID || Kind == t_GlobalID)
+      return UIntVal < RHS.UIntVal; // By numeric ID; RHS.Kind is not consulted.
+    assert((Kind == t_LocalName || Kind == t_GlobalName ||
+            Kind == t_ConstantStruct || Kind == t_PackedConstantStruct) &&
+           "Ordering not defined for this ValID kind yet");
+    return StrVal < RHS.StrVal; // Remaining permitted kinds order by StrVal.
+  }
+};
+
+class LLParser {
+public:
+  typedef LLLexer::LocTy LocTy;
+
+private:
+  LLVMContext &Context;
+  LLLexer Lex;
+  // Module being parsed, null if we are only parsing summary index.
+  Module *M;
+  // Summary index being parsed, null if we are only parsing Module.
+  ModuleSummaryIndex *Index;
+  SlotMapping *Slots;
+
+  // Instruction metadata resolution.  Each instruction can have a list of
+  // MDRef info associated with them.
+  //
+  // The simpler approach of just creating temporary MDNodes and then calling
+  // RAUW on them when the definition is processed doesn't work because some
+  // instruction metadata kinds, such as dbg, get stored in the IR in an
+  // "optimized" format which doesn't participate in the normal value use
+  // lists. This means that RAUW doesn't work, even on temporary MDNodes
+  // which otherwise support RAUW. Instead, we defer resolving MDNode
+  // references until the definitions have been processed.
+  struct MDRef {
+    SMLoc Loc;
+    unsigned MDKind, MDSlot;
+  };
 
-    bool operator<(const ValID &RHS) const {
-      if (Kind == t_LocalID || Kind == t_GlobalID)
-        return UIntVal < RHS.UIntVal;
-      assert((Kind == t_LocalName || Kind == t_GlobalName ||
-              Kind == t_ConstantStruct || Kind == t_PackedConstantStruct) &&
-             "Ordering not defined for this ValID kind yet");
-      return StrVal < RHS.StrVal;
+  SmallVector<Instruction *, 64> InstsWithTBAATag;
+
+  // Type resolution handling data structures.  The location is set when we
+  // have processed a use of the type but not a definition yet.
+  StringMap<std::pair<Type *, LocTy>> NamedTypes;
+  std::map<unsigned, std::pair<Type *, LocTy>> NumberedTypes;
+
+  std::map<unsigned, TrackingMDNodeRef> NumberedMetadata;
+  std::map<unsigned, std::pair<TempMDTuple, LocTy>> ForwardRefMDNodes;
+
+  // Global Value reference information.
+  std::map<std::string, std::pair<GlobalValue *, LocTy>> ForwardRefVals;
+  std::map<unsigned, std::pair<GlobalValue *, LocTy>> ForwardRefValIDs;
+  std::vector<GlobalValue *> NumberedVals;
+
+  // Comdat forward reference information.
+  std::map<std::string, LocTy> ForwardRefComdats;
+
+  // References to blockaddress.  The key is the function ValID, the value is
+  // a list of references to blocks in that function.
+  std::map<ValID, std::map<ValID, GlobalValue *>> ForwardRefBlockAddresses;
+  class PerFunctionState;
+  /// Reference to per-function state to allow basic blocks to be
+  /// forward-referenced by blockaddress instructions within the same
+  /// function.
+  PerFunctionState *BlockAddressPFS;
+
+  // Attribute builder reference information.
+  std::map<Value *, std::vector<unsigned>> ForwardRefAttrGroups;
+  std::map<unsigned, AttrBuilder> NumberedAttrBuilders;
+
+  // Summary global value reference information.
+  std::map<unsigned, std::vector<std::pair<ValueInfo *, LocTy>>>
+      ForwardRefValueInfos;
+  std::map<unsigned, std::vector<std::pair<AliasSummary *, LocTy>>>
+      ForwardRefAliasees;
+  std::vector<ValueInfo> NumberedValueInfos;
+
+  // Summary type id reference information.
+  std::map<unsigned, std::vector<std::pair<GlobalValue::GUID *, LocTy>>>
+      ForwardRefTypeIds;
+
+  // Map of module ID to path.
+  std::map<unsigned, StringRef> ModuleIdMap;
+
+  /// Only the llvm-as tool may set this to false to bypass
+  /// UpgradeDebugInfo so it can generate broken bitcode.
+  bool UpgradeDebugInfo;
+
+  /// DataLayout string to override that in LLVM assembly.
+  StringRef DataLayoutStr;
+
+  std::string SourceFileName;
+
+public:
+  LLParser(StringRef F, SourceMgr &SM, SMDiagnostic &Err, Module *M,
+           ModuleSummaryIndex *Index, LLVMContext &Context,
+           SlotMapping *Slots = nullptr, bool UpgradeDebugInfo = true,
+           StringRef DataLayoutString = "")
+      : Context(Context), Lex(F, SM, Err, Context), M(M), Index(Index),
+        Slots(Slots), BlockAddressPFS(nullptr),
+        UpgradeDebugInfo(UpgradeDebugInfo), DataLayoutStr(DataLayoutString) {
+    if (!DataLayoutStr.empty()) // A non-empty override is applied up front.
+      M->setDataLayout(DataLayoutStr); // NOTE(review): assumes M != nullptr.
+  }
+  bool Run();
+
+  bool parseStandaloneConstantValue(Constant *&C, const SlotMapping *Slots);
+
+  bool parseTypeAtBeginning(Type *&Ty, unsigned &Read,
+                            const SlotMapping *Slots);
+
+  LLVMContext &getContext() { return Context; }
+
+private:
+  bool Error(LocTy L, const Twine &Msg) const { return Lex.Error(L, Msg); }
+  bool TokError(const Twine &Msg) const { return Error(Lex.getLoc(), Msg); }
+
+  /// Restore the internal name and slot mappings using the mappings that
+  /// were created at an earlier parsing stage.
+  void restoreParsingState(const SlotMapping *Slots);
+
+  /// GetGlobalVal - Get a value with the specified name or ID, creating a
+  /// forward reference record if needed.  This can return null if the value
+  /// exists but does not have the right type.
+  GlobalValue *GetGlobalVal(const std::string &N, Type *Ty, LocTy Loc,
+                            bool IsCall);
+  GlobalValue *GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc, bool IsCall);
+
+  /// Get a Comdat with the specified name, creating a forward reference
+  /// record if needed.
+  Comdat *getComdat(const std::string &Name, LocTy Loc);
+
+  // Helper Routines.
+  bool ParseToken(lltok::Kind T, const char *ErrMsg);
+  bool EatIfPresent(lltok::Kind T) { // Consume T if it is the current token.
+    if (Lex.getKind() != T)
+      return false; // Different token: nothing is consumed.
+    Lex.Lex();
+    return true; // T was the current token and has been eaten.
+  }
+
+  FastMathFlags EatFastMathFlagsIfPresent() { // Eats a run of flag keywords.
+    FastMathFlags FMF;
+    while (true)
+      switch (Lex.getKind()) {
+      case lltok::kw_fast:
+        FMF.setFast();
+        Lex.Lex();
+        continue;
+      case lltok::kw_nnan:
+        FMF.setNoNaNs();
+        Lex.Lex();
+        continue;
+      case lltok::kw_ninf:
+        FMF.setNoInfs();
+        Lex.Lex();
+        continue;
+      case lltok::kw_nsz:
+        FMF.setNoSignedZeros();
+        Lex.Lex();
+        continue;
+      case lltok::kw_arcp:
+        FMF.setAllowReciprocal();
+        Lex.Lex();
+        continue;
+      case lltok::kw_contract:
+        FMF.setAllowContract(true);
+        Lex.Lex();
+        continue;
+      case lltok::kw_reassoc:
+        FMF.setAllowReassoc();
+        Lex.Lex();
+        continue;
+      case lltok::kw_afn:
+        FMF.setApproxFunc();
+        Lex.Lex();
+        continue;
+      default: // Any other token ends the scan and is left unconsumed.
+        return FMF;
+      }
+    return FMF; // Not reached: the loop only exits through the default case.
+  }
+
+  bool ParseOptionalToken(lltok::Kind T, bool &Present, LocTy *Loc = nullptr) {
+    if (Lex.getKind() != T) {
+      Present = false; // Token absent: nothing consumed, *Loc left untouched.
+    } else {
+      if (Loc)
+        *Loc = Lex.getLoc(); // Record the location before eating the token.
+      Lex.Lex();
+      Present = true;
     }
+    return false; // Never signals an error; the outcome is in Present.
+  }
+  bool ParseStringConstant(std::string &Result);
+  bool ParseUInt32(unsigned &Val);
+  bool ParseUInt32(unsigned &Val, LocTy &Loc) {
+    Loc = Lex.getLoc(); // Capture the location before the value is parsed.
+    return ParseUInt32(Val); // Forwards to the single-argument overload.
+  }
+  bool ParseUInt64(uint64_t &Val);
+  bool ParseUInt64(uint64_t &Val, LocTy &Loc) {
+    Loc = Lex.getLoc(); // Capture the location before the value is parsed.
+    return ParseUInt64(Val); // Forwards to the single-argument overload.
+  }
+  bool ParseFlag(unsigned &Val);
+
+  bool ParseStringAttribute(AttrBuilder &B);
+
+  bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM);
+  bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM);
+  bool ParseOptionalUnnamedAddr(GlobalVariable::UnnamedAddr &UnnamedAddr);
+  bool ParseOptionalAddrSpace(unsigned &AddrSpace, unsigned DefaultAS = 0);
+  bool ParseOptionalProgramAddrSpace(unsigned &AddrSpace) {
+    return ParseOptionalAddrSpace(AddrSpace, // Default taken from M's layout.
+                                  M->getDataLayout().getProgramAddressSpace());
   };
-
-  class LLParser {
-  public:
-    typedef LLLexer::LocTy LocTy;
-  private:
-    LLVMContext &Context;
-    LLLexer Lex;
-    // Module being parsed, null if we are only parsing summary index.
-    Module *M;
-    // Summary index being parsed, null if we are only parsing Module.
-    ModuleSummaryIndex *Index;
-    SlotMapping *Slots;
-
-    // Instruction metadata resolution.  Each instruction can have a list of
-    // MDRef info associated with them.
-    //
-    // The simpler approach of just creating temporary MDNodes and then calling
-    // RAUW on them when the definition is processed doesn't work because some
-    // instruction metadata kinds, such as dbg, get stored in the IR in an
-    // "optimized" format which doesn't participate in the normal value use
-    // lists. This means that RAUW doesn't work, even on temporary MDNodes
-    // which otherwise support RAUW. Instead, we defer resolving MDNode
-    // references until the definitions have been processed.
-    struct MDRef {
-      SMLoc Loc;
-      unsigned MDKind, MDSlot;
-    };
-
-    SmallVector<Instruction*, 64> InstsWithTBAATag;
-
-    // Type resolution handling data structures.  The location is set when we
-    // have processed a use of the type but not a definition yet.
-    StringMap<std::pair<Type*, LocTy> > NamedTypes;
-    std::map<unsigned, std::pair<Type*, LocTy> > NumberedTypes;
-
-    std::map<unsigned, TrackingMDNodeRef> NumberedMetadata;
-    std::map<unsigned, std::pair<TempMDTuple, LocTy>> ForwardRefMDNodes;
-
-    // Global Value reference information.
-    std::map<std::string, std::pair<GlobalValue*, LocTy> > ForwardRefVals;
-    std::map<unsigned, std::pair<GlobalValue*, LocTy> > ForwardRefValIDs;
-    std::vector<GlobalValue*> NumberedVals;
-
-    // Comdat forward reference information.
-    std::map<std::string, LocTy> ForwardRefComdats;
-
-    // References to blockaddress.  The key is the function ValID, the value is
-    // a list of references to blocks in that function.
-    std::map<ValID, std::map<ValID, GlobalValue *>> ForwardRefBlockAddresses;
-    class PerFunctionState;
-    /// Reference to per-function state to allow basic blocks to be
-    /// forward-referenced by blockaddress instructions within the same
-    /// function.
-    PerFunctionState *BlockAddressPFS;
-
-    // Attribute builder reference information.
-    std::map<Value*, std::vector<unsigned> > ForwardRefAttrGroups;
-    std::map<unsigned, AttrBuilder> NumberedAttrBuilders;
-
-    // Summary global value reference information.
-    std::map<unsigned, std::vector<std::pair<ValueInfo *, LocTy>>>
-        ForwardRefValueInfos;
-    std::map<unsigned, std::vector<std::pair<AliasSummary *, LocTy>>>
-        ForwardRefAliasees;
-    std::vector<ValueInfo> NumberedValueInfos;
-
-    // Summary type id reference information.
-    std::map<unsigned, std::vector<std::pair<GlobalValue::GUID *, LocTy>>>
-        ForwardRefTypeIds;
-
-    // Map of module ID to path.
-    std::map<unsigned, StringRef> ModuleIdMap;
-
-    /// Only the llvm-as tool may set this to false to bypass
-    /// UpgradeDebuginfo so it can generate broken bitcode.
-    bool UpgradeDebugInfo;
-
-    /// DataLayout string to override that in LLVM assembly.
-    StringRef DataLayoutStr;
-
-    std::string SourceFileName;
+  bool ParseOptionalParamAttrs(AttrBuilder &B);
+  bool ParseOptionalReturnAttrs(AttrBuilder &B);
+  bool ParseOptionalLinkage(unsigned &Res, bool &HasLinkage,
+                            unsigned &Visibility, unsigned &DLLStorageClass,
+                            bool &DSOLocal);
+  void ParseOptionalDSOLocal(bool &DSOLocal);
+  void ParseOptionalVisibility(unsigned &Res);
+  void ParseOptionalDLLStorageClass(unsigned &Res);
+  bool ParseOptionalCallingConv(unsigned &CC);
+  bool ParseOptionalAlignment(unsigned &Alignment);
+  bool ParseOptionalDerefAttrBytes(lltok::Kind AttrKind, uint64_t &Bytes);
+  bool ParseScopeAndOrdering(bool isAtomic, SyncScope::ID &SSID,
+                             AtomicOrdering &Ordering);
+  bool ParseScope(SyncScope::ID &SSID);
+  bool ParseOrdering(AtomicOrdering &Ordering);
+  bool ParseOptionalStackAlignment(unsigned &Alignment);
+  bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma);
+  bool ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc,
+                                   bool &AteExtraComma);
+  bool ParseOptionalCommaInAlloca(bool &IsInAlloca);
+  bool parseAllocSizeArguments(unsigned &BaseSizeArg,
+                               Optional<unsigned> &HowManyArg);
+  bool ParseIndexList(SmallVectorImpl<unsigned> &Indices, bool &AteExtraComma);
+  bool ParseIndexList(SmallVectorImpl<unsigned> &Indices) {
+    bool AteExtraComma;
+    if (ParseIndexList(Indices, AteExtraComma))
+      return true;
+    if (AteExtraComma)
+      return TokError("expected index");
+    return false;
+  }
+
+  // Top-Level Entities
+  bool ParseTopLevelEntities();
+  bool ValidateEndOfModule();
+  bool ValidateEndOfIndex();
+  bool ParseTargetDefinition();
+  bool ParseModuleAsm();
+  bool ParseSourceFileName();
+  bool ParseDepLibs(); // FIXME: Remove in 4.0.
+  bool ParseUnnamedType();
+  bool ParseNamedType();
+  bool ParseDeclare();
+  bool ParseDefine();
+
+  bool ParseGlobalType(bool &IsConstant);
+  bool ParseUnnamedGlobal();
+  bool ParseNamedGlobal();
+  bool ParseGlobal(const std::string &Name, LocTy NameLoc, unsigned Linkage,
+                   bool HasLinkage, unsigned Visibility,
+                   unsigned DLLStorageClass, bool DSOLocal,
+                   GlobalVariable::ThreadLocalMode TLM,
+                   GlobalVariable::UnnamedAddr UnnamedAddr);
+  bool parseIndirectSymbol(const std::string &Name, LocTy NameLoc, unsigned L,
+                           unsigned Visibility, unsigned DLLStorageClass,
+                           bool DSOLocal, GlobalVariable::ThreadLocalMode TLM,
+                           GlobalVariable::UnnamedAddr UnnamedAddr);
+  bool parseComdat();
+  bool ParseStandaloneMetadata();
+  bool ParseNamedMetadata();
+  bool ParseMDString(MDString *&Result);
+  bool ParseMDNodeID(MDNode *&Result);
+  bool ParseUnnamedAttrGrp();
+  bool ParseFnAttributeValuePairs(AttrBuilder &B,
+                                  std::vector<unsigned> &FwdRefAttrGrps,
+                                  bool inAttrGrp, LocTy &BuiltinLoc);
+  bool ParseByValWithOptionalType(Type *&Result);
+
+  // Module Summary Index Parsing.
+  bool SkipModuleSummaryEntry();
+  bool ParseSummaryEntry();
+  bool ParseModuleEntry(unsigned ID);
+  bool ParseModuleReference(StringRef &ModulePath);
+  bool ParseGVReference(ValueInfo &VI, unsigned &GVId);
+  bool ParseGVEntry(unsigned ID);
+  bool ParseFunctionSummary(std::string Name, GlobalValue::GUID, unsigned ID);
+  bool ParseVariableSummary(std::string Name, GlobalValue::GUID, unsigned ID);
+  bool ParseAliasSummary(std::string Name, GlobalValue::GUID, unsigned ID);
+  bool ParseGVFlags(GlobalValueSummary::GVFlags &GVFlags);
+  bool ParseGVarFlags(GlobalVarSummary::GVarFlags &GVarFlags);
+  bool ParseOptionalFFlags(FunctionSummary::FFlags &FFlags);
+  bool ParseOptionalCalls(std::vector<FunctionSummary::EdgeTy> &Calls);
+  bool ParseHotness(CalleeInfo::HotnessType &Hotness);
+  bool ParseOptionalTypeIdInfo(FunctionSummary::TypeIdInfo &TypeIdInfo);
+  bool ParseTypeTests(std::vector<GlobalValue::GUID> &TypeTests);
+  bool ParseVFuncIdList(lltok::Kind Kind,
+                        std::vector<FunctionSummary::VFuncId> &VFuncIdList);
+  bool
+  ParseConstVCallList(lltok::Kind Kind,
+                      std::vector<FunctionSummary::ConstVCall> &ConstVCallList);
+  using IdToIndexMapType =
+      std::map<unsigned, std::vector<std::pair<unsigned, LocTy>>>;
+  bool ParseConstVCall(FunctionSummary::ConstVCall &ConstVCall,
+                       IdToIndexMapType &IdToIndexMap, unsigned Index);
+  bool ParseVFuncId(FunctionSummary::VFuncId &VFuncId,
+                    IdToIndexMapType &IdToIndexMap, unsigned Index);
+  bool ParseOptionalVTableFuncs(VTableFuncList &VTableFuncs);
+  bool ParseOptionalRefs(std::vector<ValueInfo> &Refs);
+  bool ParseTypeIdEntry(unsigned ID);
+  bool ParseTypeIdSummary(TypeIdSummary &TIS);
+  bool ParseTypeIdCompatibleVtableEntry(unsigned ID);
+  bool ParseTypeTestResolution(TypeTestResolution &TTRes);
+  bool ParseOptionalWpdResolutions(
+      std::map<uint64_t, WholeProgramDevirtResolution> &WPDResMap);
+  bool ParseWpdRes(WholeProgramDevirtResolution &WPDRes);
+  bool ParseOptionalResByArg(
+      std::map<std::vector<uint64_t>, WholeProgramDevirtResolution::ByArg>
+          &ResByArg);
+  bool ParseArgs(std::vector<uint64_t> &Args);
+  void AddGlobalValueToIndex(std::string Name, GlobalValue::GUID,
+                             GlobalValue::LinkageTypes Linkage, unsigned ID,
+                             std::unique_ptr<GlobalValueSummary> Summary);
+
+  // Type Parsing.
+  bool ParseType(Type *&Result, const Twine &Msg, bool AllowVoid = false);
+  bool ParseType(Type *&Result, bool AllowVoid = false) {
+    return ParseType(Result, "expected type", AllowVoid);
+  }
+  bool ParseType(Type *&Result, const Twine &Msg, LocTy &Loc,
+                 bool AllowVoid = false) {
+    Loc = Lex.getLoc();
+    return ParseType(Result, Msg, AllowVoid);
+  }
+  bool ParseType(Type *&Result, LocTy &Loc, bool AllowVoid = false) {
+    Loc = Lex.getLoc();
+    return ParseType(Result, AllowVoid);
+  }
+  bool ParseAnonStructType(Type *&Result, bool Packed);
+  bool ParseStructBody(SmallVectorImpl<Type *> &Body);
+  bool ParseStructDefinition(SMLoc TypeLoc, StringRef Name,
+                             std::pair<Type *, LocTy> &Entry, Type *&ResultTy);
+
+  bool ParseArrayVectorType(Type *&Result, bool isVector);
+  bool ParseFunctionType(Type *&Result);
+
+  // Function Semantic Analysis.
+  class PerFunctionState {
+    LLParser &P;
+    Function &F;
+    std::map<std::string, std::pair<Value *, LocTy>> ForwardRefVals;
+    std::map<unsigned, std::pair<Value *, LocTy>> ForwardRefValIDs;
+    std::vector<Value *> NumberedVals;
+
+    /// FunctionNumber - If this is an unnamed function, this is the slot
+    /// number of it; otherwise it is -1.
+    int FunctionNumber;
 
   public:
-    LLParser(StringRef F, SourceMgr &SM, SMDiagnostic &Err, Module *M,
-             ModuleSummaryIndex *Index, LLVMContext &Context,
-             SlotMapping *Slots = nullptr, bool UpgradeDebugInfo = true,
-             StringRef DataLayoutString = "")
-        : Context(Context), Lex(F, SM, Err, Context), M(M), Index(Index),
-          Slots(Slots), BlockAddressPFS(nullptr),
-          UpgradeDebugInfo(UpgradeDebugInfo), DataLayoutStr(DataLayoutString) {
-      if (!DataLayoutStr.empty())
-        M->setDataLayout(DataLayoutStr);
-    }
-    bool Run();
-
-    bool parseStandaloneConstantValue(Constant *&C, const SlotMapping *Slots);
+    PerFunctionState(LLParser &p, Function &f, int functionNumber);
+    ~PerFunctionState();
 
-    bool parseTypeAtBeginning(Type *&Ty, unsigned &Read,
-                              const SlotMapping *Slots);
+    Function &getFunction() const { return F; }
 
-    LLVMContext &getContext() { return Context; }
+    bool FinishFunction();
 
-  private:
-
-    bool Error(LocTy L, const Twine &Msg) const {
-      return Lex.Error(L, Msg);
-    }
-    bool TokError(const Twine &Msg) const {
-      return Error(Lex.getLoc(), Msg);
-    }
-
-    /// Restore the internal name and slot mappings using the mappings that
-    /// were created at an earlier parsing stage.
-    void restoreParsingState(const SlotMapping *Slots);
-
-    /// GetGlobalVal - Get a value with the specified name or ID, creating a
+    /// GetVal - Get a value with the specified name or ID, creating a
     /// forward reference record if needed.  This can return null if the value
     /// exists but does not have the right type.
-    GlobalValue *GetGlobalVal(const std::string &N, Type *Ty, LocTy Loc,
-                              bool IsCall);
-    GlobalValue *GetGlobalVal(unsigned ID, Type *Ty, LocTy Loc, bool IsCall);
-
-    /// Get a Comdat with the specified name, creating a forward reference
-    /// record if needed.
-    Comdat *getComdat(const std::string &Name, LocTy Loc);
-
-    // Helper Routines.
-    bool ParseToken(lltok::Kind T, const char *ErrMsg);
-    bool EatIfPresent(lltok::Kind T) {
-      if (Lex.getKind() != T) return false;
-      Lex.Lex();
-      return true;
-    }
-
-    FastMathFlags EatFastMathFlagsIfPresent() {
-      FastMathFlags FMF;
-      while (true)
-        switch (Lex.getKind()) {
-        case lltok::kw_fast: FMF.setFast();            Lex.Lex(); continue;
-        case lltok::kw_nnan: FMF.setNoNaNs();          Lex.Lex(); continue;
-        case lltok::kw_ninf: FMF.setNoInfs();          Lex.Lex(); continue;
-        case lltok::kw_nsz:  FMF.setNoSignedZeros();   Lex.Lex(); continue;
-        case lltok::kw_arcp: FMF.setAllowReciprocal(); Lex.Lex(); continue;
-        case lltok::kw_contract:
-          FMF.setAllowContract(true);
-          Lex.Lex();
-          continue;
-        case lltok::kw_reassoc: FMF.setAllowReassoc(); Lex.Lex(); continue;
-        case lltok::kw_afn:     FMF.setApproxFunc();   Lex.Lex(); continue;
-        default: return FMF;
-        }
-      return FMF;
-    }
-
-    bool ParseOptionalToken(lltok::Kind T, bool &Present,
-                            LocTy *Loc = nullptr) {
-      if (Lex.getKind() != T) {
-        Present = false;
-      } else {
-        if (Loc)
-          *Loc = Lex.getLoc();
-        Lex.Lex();
-        Present = true;
-      }
-      return false;
-    }
-    bool ParseStringConstant(std::string &Result);
-    bool ParseUInt32(unsigned &Val);
-    bool ParseUInt32(unsigned &Val, LocTy &Loc) {
-      Loc = Lex.getLoc();
-      return ParseUInt32(Val);
-    }
-    bool ParseUInt64(uint64_t &Val);
-    bool ParseUInt64(uint64_t &Val, LocTy &Loc) {
-      Loc = Lex.getLoc();
-      return ParseUInt64(Val);
-    }
-    bool ParseFlag(unsigned &Val);
-
-    bool ParseStringAttribute(AttrBuilder &B);
-
-    bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM);
-    bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM);
-    bool ParseOptionalUnnamedAddr(GlobalVariable::UnnamedAddr &UnnamedAddr);
-    bool ParseOptionalAddrSpace(unsigned &AddrSpace, unsigned DefaultAS = 0);
-    bool ParseOptionalProgramAddrSpace(unsigned &AddrSpace) {
-      return ParseOptionalAddrSpace(
-          AddrSpace, M->getDataLayout().getProgramAddressSpace());
-    };
-    bool ParseOptionalParamAttrs(AttrBuilder &B);
-    bool ParseOptionalReturnAttrs(AttrBuilder &B);
-    bool ParseOptionalLinkage(unsigned &Res, bool &HasLinkage,
-                              unsigned &Visibility, unsigned &DLLStorageClass,
-                              bool &DSOLocal);
-    void ParseOptionalDSOLocal(bool &DSOLocal);
-    void ParseOptionalVisibility(unsigned &Res);
-    void ParseOptionalDLLStorageClass(unsigned &Res);
-    bool ParseOptionalCallingConv(unsigned &CC);
-    bool ParseOptionalAlignment(unsigned &Alignment);
-    bool ParseOptionalDerefAttrBytes(lltok::Kind AttrKind, uint64_t &Bytes);
-    bool ParseScopeAndOrdering(bool isAtomic, SyncScope::ID &SSID,
-                               AtomicOrdering &Ordering);
-    bool ParseScope(SyncScope::ID &SSID);
-    bool ParseOrdering(AtomicOrdering &Ordering);
-    bool ParseOptionalStackAlignment(unsigned &Alignment);
-    bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma);
-    bool ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc,
-                                     bool &AteExtraComma);
-    bool ParseOptionalCommaInAlloca(bool &IsInAlloca);
-    bool parseAllocSizeArguments(unsigned &BaseSizeArg,
-                                 Optional<unsigned> &HowManyArg);
-    bool ParseIndexList(SmallVectorImpl<unsigned> &Indices,
-                        bool &AteExtraComma);
-    bool ParseIndexList(SmallVectorImpl<unsigned> &Indices) {
-      bool AteExtraComma;
-      if (ParseIndexList(Indices, AteExtraComma)) return true;
-      if (AteExtraComma)
-        return TokError("expected index");
-      return false;
-    }
+    Value *GetVal(const std::string &Name, Type *Ty, LocTy Loc, bool IsCall);
+    Value *GetVal(unsigned ID, Type *Ty, LocTy Loc, bool IsCall);
 
-    // Top-Level Entities
-    bool ParseTopLevelEntities();
-    bool ValidateEndOfModule();
-    bool ValidateEndOfIndex();
-    bool ParseTargetDefinition();
-    bool ParseModuleAsm();
-    bool ParseSourceFileName();
-    bool ParseDepLibs();        // FIXME: Remove in 4.0.
-    bool ParseUnnamedType();
-    bool ParseNamedType();
-    bool ParseDeclare();
-    bool ParseDefine();
-
-    bool ParseGlobalType(bool &IsConstant);
-    bool ParseUnnamedGlobal();
-    bool ParseNamedGlobal();
-    bool ParseGlobal(const std::string &Name, LocTy NameLoc, unsigned Linkage,
-                     bool HasLinkage, unsigned Visibility,
-                     unsigned DLLStorageClass, bool DSOLocal,
-                     GlobalVariable::ThreadLocalMode TLM,
-                     GlobalVariable::UnnamedAddr UnnamedAddr);
-    bool parseIndirectSymbol(const std::string &Name, LocTy NameLoc,
-                             unsigned L, unsigned Visibility,
-                             unsigned DLLStorageClass, bool DSOLocal,
-                             GlobalVariable::ThreadLocalMode TLM,
-                             GlobalVariable::UnnamedAddr UnnamedAddr);
-    bool parseComdat();
-    bool ParseStandaloneMetadata();
-    bool ParseNamedMetadata();
-    bool ParseMDString(MDString *&Result);
-    bool ParseMDNodeID(MDNode *&Result);
-    bool ParseUnnamedAttrGrp();
-    bool ParseFnAttributeValuePairs(AttrBuilder &B,
-                                    std::vector<unsigned> &FwdRefAttrGrps,
-                                    bool inAttrGrp, LocTy &BuiltinLoc);
-    bool ParseByValWithOptionalType(Type *&Result);
-
-    // Module Summary Index Parsing.
-    bool SkipModuleSummaryEntry();
-    bool ParseSummaryEntry();
-    bool ParseModuleEntry(unsigned ID);
-    bool ParseModuleReference(StringRef &ModulePath);
-    bool ParseGVReference(ValueInfo &VI, unsigned &GVId);
-    bool ParseGVEntry(unsigned ID);
-    bool ParseFunctionSummary(std::string Name, GlobalValue::GUID, unsigned ID);
-    bool ParseVariableSummary(std::string Name, GlobalValue::GUID, unsigned ID);
-    bool ParseAliasSummary(std::string Name, GlobalValue::GUID, unsigned ID);
-    bool ParseGVFlags(GlobalValueSummary::GVFlags &GVFlags);
-    bool ParseGVarFlags(GlobalVarSummary::GVarFlags &GVarFlags);
-    bool ParseOptionalFFlags(FunctionSummary::FFlags &FFlags);
-    bool ParseOptionalCalls(std::vector<FunctionSummary::EdgeTy> &Calls);
-    bool ParseHotness(CalleeInfo::HotnessType &Hotness);
-    bool ParseOptionalTypeIdInfo(FunctionSummary::TypeIdInfo &TypeIdInfo);
-    bool ParseTypeTests(std::vector<GlobalValue::GUID> &TypeTests);
-    bool ParseVFuncIdList(lltok::Kind Kind,
-                          std::vector<FunctionSummary::VFuncId> &VFuncIdList);
-    bool ParseConstVCallList(
-        lltok::Kind Kind,
-        std::vector<FunctionSummary::ConstVCall> &ConstVCallList);
-    using IdToIndexMapType =
-        std::map<unsigned, std::vector<std::pair<unsigned, LocTy>>>;
-    bool ParseConstVCall(FunctionSummary::ConstVCall &ConstVCall,
-                         IdToIndexMapType &IdToIndexMap, unsigned Index);
-    bool ParseVFuncId(FunctionSummary::VFuncId &VFuncId,
-                      IdToIndexMapType &IdToIndexMap, unsigned Index);
-    bool ParseOptionalVTableFuncs(VTableFuncList &VTableFuncs);
-    bool ParseOptionalRefs(std::vector<ValueInfo> &Refs);
-    bool ParseTypeIdEntry(unsigned ID);
-    bool ParseTypeIdSummary(TypeIdSummary &TIS);
-    bool ParseTypeIdCompatibleVtableEntry(unsigned ID);
-    bool ParseTypeTestResolution(TypeTestResolution &TTRes);
-    bool ParseOptionalWpdResolutions(
-        std::map<uint64_t, WholeProgramDevirtResolution> &WPDResMap);
-    bool ParseWpdRes(WholeProgramDevirtResolution &WPDRes);
-    bool ParseOptionalResByArg(
-        std::map<std::vector<uint64_t>, WholeProgramDevirtResolution::ByArg>
-            &ResByArg);
-    bool ParseArgs(std::vector<uint64_t> &Args);
-    void AddGlobalValueToIndex(std::string Name, GlobalValue::GUID,
-                               GlobalValue::LinkageTypes Linkage, unsigned ID,
-                               std::unique_ptr<GlobalValueSummary> Summary);
-
-    // Type Parsing.
-    bool ParseType(Type *&Result, const Twine &Msg, bool AllowVoid = false);
-    bool ParseType(Type *&Result, bool AllowVoid = false) {
-      return ParseType(Result, "expected type", AllowVoid);
-    }
-    bool ParseType(Type *&Result, const Twine &Msg, LocTy &Loc,
-                   bool AllowVoid = false) {
-      Loc = Lex.getLoc();
-      return ParseType(Result, Msg, AllowVoid);
-    }
-    bool ParseType(Type *&Result, LocTy &Loc, bool AllowVoid = false) {
-      Loc = Lex.getLoc();
-      return ParseType(Result, AllowVoid);
-    }
-    bool ParseAnonStructType(Type *&Result, bool Packed);
-    bool ParseStructBody(SmallVectorImpl<Type*> &Body);
-    bool ParseStructDefinition(SMLoc TypeLoc, StringRef Name,
-                               std::pair<Type*, LocTy> &Entry,
-                               Type *&ResultTy);
-
-    bool ParseArrayVectorType(Type *&Result, bool isVector);
-    bool ParseFunctionType(Type *&Result);
-
-    // Function Semantic Analysis.
-    class PerFunctionState {
-      LLParser &P;
-      Function &F;
-      std::map<std::string, std::pair<Value*, LocTy> > ForwardRefVals;
-      std::map<unsigned, std::pair<Value*, LocTy> > ForwardRefValIDs;
-      std::vector<Value*> NumberedVals;
-
-      /// FunctionNumber - If this is an unnamed function, this is the slot
-      /// number of it, otherwise it is -1.
-      int FunctionNumber;
-    public:
-      PerFunctionState(LLParser &p, Function &f, int functionNumber);
-      ~PerFunctionState();
-
-      Function &getFunction() const { return F; }
-
-      bool FinishFunction();
-
-      /// GetVal - Get a value with the specified name or ID, creating a
-      /// forward reference record if needed.  This can return null if the value
-      /// exists but does not have the right type.
-      Value *GetVal(const std::string &Name, Type *Ty, LocTy Loc, bool IsCall);
-      Value *GetVal(unsigned ID, Type *Ty, LocTy Loc, bool IsCall);
-
-      /// SetInstName - After an instruction is parsed and inserted into its
-      /// basic block, this installs its name.
-      bool SetInstName(int NameID, const std::string &NameStr, LocTy NameLoc,
-                       Instruction *Inst);
-
-      /// GetBB - Get a basic block with the specified name or ID, creating a
-      /// forward reference record if needed.  This can return null if the value
-      /// is not a BasicBlock.
-      BasicBlock *GetBB(const std::string &Name, LocTy Loc);
-      BasicBlock *GetBB(unsigned ID, LocTy Loc);
-
-      /// DefineBB - Define the specified basic block, which is either named or
-      /// unnamed.  If there is an error, this returns null otherwise it returns
-      /// the block being defined.
-      BasicBlock *DefineBB(const std::string &Name, int NameID, LocTy Loc);
-
-      bool resolveForwardRefBlockAddresses();
-    };
-
-    bool ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
-                             PerFunctionState *PFS, bool IsCall);
-
-    Value *checkValidVariableType(LocTy Loc, const Twine &Name, Type *Ty,
-                                  Value *Val, bool IsCall);
-
-    bool parseConstantValue(Type *Ty, Constant *&C);
-    bool ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS);
-    bool ParseValue(Type *Ty, Value *&V, PerFunctionState &PFS) {
-      return ParseValue(Ty, V, &PFS);
-    }
+    /// SetInstName - After an instruction is parsed and inserted into its
+    /// basic block, this installs its name.
+    bool SetInstName(int NameID, const std::string &NameStr, LocTy NameLoc,
+                     Instruction *Inst);
 
-    bool ParseValue(Type *Ty, Value *&V, LocTy &Loc,
-                    PerFunctionState &PFS) {
-      Loc = Lex.getLoc();
-      return ParseValue(Ty, V, &PFS);
-    }
+    /// GetBB - Get a basic block with the specified name or ID, creating a
+    /// forward reference record if needed.  This can return null if the value
+    /// is not a BasicBlock.
+    BasicBlock *GetBB(const std::string &Name, LocTy Loc);
+    BasicBlock *GetBB(unsigned ID, LocTy Loc);
 
-    bool ParseTypeAndValue(Value *&V, PerFunctionState *PFS);
-    bool ParseTypeAndValue(Value *&V, PerFunctionState &PFS) {
-      return ParseTypeAndValue(V, &PFS);
-    }
-    bool ParseTypeAndValue(Value *&V, LocTy &Loc, PerFunctionState &PFS) {
-      Loc = Lex.getLoc();
-      return ParseTypeAndValue(V, PFS);
-    }
-    bool ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc,
-                                PerFunctionState &PFS);
-    bool ParseTypeAndBasicBlock(BasicBlock *&BB, PerFunctionState &PFS) {
-      LocTy Loc;
-      return ParseTypeAndBasicBlock(BB, Loc, PFS);
-    }
+    /// DefineBB - Define the specified basic block, which is either named or
+    /// unnamed.  If there is an error, this returns null; otherwise it returns
+    /// the block being defined.
+    BasicBlock *DefineBB(const std::string &Name, int NameID, LocTy Loc);
 
+    bool resolveForwardRefBlockAddresses();
+  };
 
-    struct ParamInfo {
-      LocTy Loc;
-      Value *V;
-      AttributeSet Attrs;
-      ParamInfo(LocTy loc, Value *v, AttributeSet attrs)
-          : Loc(loc), V(v), Attrs(attrs) {}
-    };
-    bool ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
-                            PerFunctionState &PFS,
-                            bool IsMustTailCall = false,
-                            bool InVarArgsFunc = false);
-
-    bool
-    ParseOptionalOperandBundles(SmallVectorImpl<OperandBundleDef> &BundleList,
-                                PerFunctionState &PFS);
-
-    bool ParseExceptionArgs(SmallVectorImpl<Value *> &Args,
-                            PerFunctionState &PFS);
-
-    // Constant Parsing.
-    bool ParseValID(ValID &ID, PerFunctionState *PFS = nullptr);
-    bool ParseGlobalValue(Type *Ty, Constant *&C);
-    bool ParseGlobalTypeAndValue(Constant *&V);
-    bool ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts,
-                                Optional<unsigned> *InRangeOp = nullptr);
-    bool parseOptionalComdat(StringRef GlobalName, Comdat *&C);
-    bool ParseMetadataAsValue(Value *&V, PerFunctionState &PFS);
-    bool ParseValueAsMetadata(Metadata *&MD, const Twine &TypeMsg,
-                              PerFunctionState *PFS);
-    bool ParseMetadata(Metadata *&MD, PerFunctionState *PFS);
-    bool ParseMDTuple(MDNode *&MD, bool IsDistinct = false);
-    bool ParseMDNode(MDNode *&N);
-    bool ParseMDNodeTail(MDNode *&N);
-    bool ParseMDNodeVector(SmallVectorImpl<Metadata *> &Elts);
-    bool ParseMetadataAttachment(unsigned &Kind, MDNode *&MD);
-    bool ParseInstructionMetadata(Instruction &Inst);
-    bool ParseGlobalObjectMetadataAttachment(GlobalObject &GO);
-    bool ParseOptionalFunctionMetadata(Function &F);
-
-    template <class FieldTy>
-    bool ParseMDField(LocTy Loc, StringRef Name, FieldTy &Result);
-    template <class FieldTy> bool ParseMDField(StringRef Name, FieldTy &Result);
-    template <class ParserTy>
-    bool ParseMDFieldsImplBody(ParserTy parseField);
-    template <class ParserTy>
-    bool ParseMDFieldsImpl(ParserTy parseField, LocTy &ClosingLoc);
-    bool ParseSpecializedMDNode(MDNode *&N, bool IsDistinct = false);
+  bool ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
+                           PerFunctionState *PFS, bool IsCall);
+
+  Value *checkValidVariableType(LocTy Loc, const Twine &Name, Type *Ty,
+                                Value *Val, bool IsCall);
+
+  bool parseConstantValue(Type *Ty, Constant *&C);
+  bool ParseValue(Type *Ty, Value *&V, PerFunctionState *PFS);
+  bool ParseValue(Type *Ty, Value *&V, PerFunctionState &PFS) {
+    return ParseValue(Ty, V, &PFS);
+  }
+
+  bool ParseValue(Type *Ty, Value *&V, LocTy &Loc, PerFunctionState &PFS) {
+    Loc = Lex.getLoc();
+    return ParseValue(Ty, V, &PFS);
+  }
+
+  bool ParseTypeAndValue(Value *&V, PerFunctionState *PFS);
+  bool ParseTypeAndValue(Value *&V, PerFunctionState &PFS) {
+    return ParseTypeAndValue(V, &PFS);
+  }
+  bool ParseTypeAndValue(Value *&V, LocTy &Loc, PerFunctionState &PFS) {
+    Loc = Lex.getLoc();
+    return ParseTypeAndValue(V, PFS);
+  }
+  bool ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc,
+                              PerFunctionState &PFS);
+  bool ParseTypeAndBasicBlock(BasicBlock *&BB, PerFunctionState &PFS) {
+    LocTy Loc;
+    return ParseTypeAndBasicBlock(BB, Loc, PFS);
+  }
+
+  struct ParamInfo {
+    LocTy Loc;
+    Value *V;
+    AttributeSet Attrs;
+    ParamInfo(LocTy loc, Value *v, AttributeSet attrs)
+        : Loc(loc), V(v), Attrs(attrs) {}
+  };
+  bool ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
+                          PerFunctionState &PFS, bool IsMustTailCall = false,
+                          bool InVarArgsFunc = false);
+
+  bool
+  ParseOptionalOperandBundles(SmallVectorImpl<OperandBundleDef> &BundleList,
+                              PerFunctionState &PFS);
+
+  bool ParseExceptionArgs(SmallVectorImpl<Value *> &Args,
+                          PerFunctionState &PFS);
+
+  // Constant Parsing.
+  bool ParseValID(ValID &ID, PerFunctionState *PFS = nullptr);
+  bool ParseGlobalValue(Type *Ty, Constant *&C);
+  bool ParseGlobalTypeAndValue(Constant *&V);
+  bool ParseGlobalValueVector(SmallVectorImpl<Constant *> &Elts,
+                              Optional<unsigned> *InRangeOp = nullptr);
+  bool parseOptionalComdat(StringRef GlobalName, Comdat *&C);
+  bool ParseMetadataAsValue(Value *&V, PerFunctionState &PFS);
+  bool ParseValueAsMetadata(Metadata *&MD, const Twine &TypeMsg,
+                            PerFunctionState *PFS);
+  bool ParseMetadata(Metadata *&MD, PerFunctionState *PFS);
+  bool ParseMDTuple(MDNode *&MD, bool IsDistinct = false);
+  bool ParseMDNode(MDNode *&N);
+  bool ParseMDNodeTail(MDNode *&N);
+  bool ParseMDNodeVector(SmallVectorImpl<Metadata *> &Elts);
+  bool ParseMetadataAttachment(unsigned &Kind, MDNode *&MD);
+  bool ParseInstructionMetadata(Instruction &Inst);
+  bool ParseGlobalObjectMetadataAttachment(GlobalObject &GO);
+  bool ParseOptionalFunctionMetadata(Function &F);
+
+  template <class FieldTy>
+  bool ParseMDField(LocTy Loc, StringRef Name, FieldTy &Result);
+  template <class FieldTy> bool ParseMDField(StringRef Name, FieldTy &Result);
+  template <class ParserTy> bool ParseMDFieldsImplBody(ParserTy parseField);
+  template <class ParserTy>
+  bool ParseMDFieldsImpl(ParserTy parseField, LocTy &ClosingLoc);
+  bool ParseSpecializedMDNode(MDNode *&N, bool IsDistinct = false);
 
 #define HANDLE_SPECIALIZED_MDNODE_LEAF(CLASS)                                  \
   bool Parse##CLASS(MDNode *&Result, bool IsDistinct);
 #include "llvm/IR/Metadata.def"
 
-    // Function Parsing.
-    struct ArgInfo {
-      LocTy Loc;
-      Type *Ty;
-      AttributeSet Attrs;
-      std::string Name;
-      ArgInfo(LocTy L, Type *ty, AttributeSet Attr, const std::string &N)
-          : Loc(L), Ty(ty), Attrs(Attr), Name(N) {}
-    };
-    bool ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, bool &isVarArg);
-    bool ParseFunctionHeader(Function *&Fn, bool isDefine);
-    bool ParseFunctionBody(Function &Fn);
-    bool ParseBasicBlock(PerFunctionState &PFS);
-
-    enum TailCallType { TCT_None, TCT_Tail, TCT_MustTail };
-
-    // Instruction Parsing.  Each instruction parsing routine can return with a
-    // normal result, an error result, or return having eaten an extra comma.
-    enum InstResult { InstNormal = 0, InstError = 1, InstExtraComma = 2 };
-    int ParseInstruction(Instruction *&Inst, BasicBlock *BB,
-                         PerFunctionState &PFS);
-    bool ParseCmpPredicate(unsigned &P, unsigned Opc);
-
-    bool ParseRet(Instruction *&Inst, BasicBlock *BB, PerFunctionState &PFS);
-    bool ParseBr(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseSwitch(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseInvoke(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseResume(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseCleanupRet(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseCatchRet(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseCallBr(Instruction *&Inst, PerFunctionState &PFS);
-
-    bool ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc,
-                      bool IsFP);
-    bool ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc,
-                         bool IsFP);
-    bool ParseLogical(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc);
-    bool ParseCompare(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc);
-    bool ParseCast(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc);
-    bool ParseSelect(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseVA_Arg(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseExtractElement(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseInsertElement(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseShuffleVector(Instruction *&Inst, PerFunctionState &PFS);
-    int ParsePHI(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS);
-    bool ParseCall(Instruction *&Inst, PerFunctionState &PFS,
-                   CallInst::TailCallKind TCK);
-    int ParseAlloc(Instruction *&Inst, PerFunctionState &PFS);
-    int ParseLoad(Instruction *&Inst, PerFunctionState &PFS);
-    int ParseStore(Instruction *&Inst, PerFunctionState &PFS);
-    int ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS);
-    int ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS);
-    int ParseFence(Instruction *&Inst, PerFunctionState &PFS);
-    int ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS);
-    int ParseExtractValue(Instruction *&Inst, PerFunctionState &PFS);
-    int ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS);
-
-    // Use-list order directives.
-    bool ParseUseListOrder(PerFunctionState *PFS = nullptr);
-    bool ParseUseListOrderBB();
-    bool ParseUseListOrderIndexes(SmallVectorImpl<unsigned> &Indexes);
-    bool sortUseListOrder(Value *V, ArrayRef<unsigned> Indexes, SMLoc Loc);
+  // Function Parsing.
+  struct ArgInfo {
+    LocTy Loc;
+    Type *Ty;
+    AttributeSet Attrs;
+    std::string Name;
+    ArgInfo(LocTy L, Type *ty, AttributeSet Attr, const std::string &N)
+        : Loc(L), Ty(ty), Attrs(Attr), Name(N) {}
   };
-} // End llvm namespace
+  bool ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, bool &isVarArg);
+  bool ParseFunctionHeader(Function *&Fn, bool isDefine);
+  bool ParseFunctionBody(Function &Fn);
+  bool ParseBasicBlock(PerFunctionState &PFS);
+
+  enum TailCallType { TCT_None, TCT_Tail, TCT_MustTail };
+
+  // Instruction Parsing.  Each instruction parsing routine can return with a
+  // normal result, an error result, or return having eaten an extra comma.
+  enum InstResult { InstNormal = 0, InstError = 1, InstExtraComma = 2 };
+  int ParseInstruction(Instruction *&Inst, BasicBlock *BB,
+                       PerFunctionState &PFS);
+  bool ParseCmpPredicate(unsigned &P, unsigned Opc);
+
+  bool ParseRet(Instruction *&Inst, BasicBlock *BB, PerFunctionState &PFS);
+  bool ParseBr(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseSwitch(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseInvoke(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseResume(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseCleanupRet(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseCatchRet(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseCallBr(Instruction *&Inst, PerFunctionState &PFS);
+
+  bool ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc,
+                    bool IsFP);
+  bool ParseArithmetic(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc,
+                       bool IsFP);
+  bool ParseLogical(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc);
+  bool ParseCompare(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc);
+  bool ParseCast(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc);
+  bool ParseSelect(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseVA_Arg(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseExtractElement(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseInsertElement(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseShuffleVector(Instruction *&Inst, PerFunctionState &PFS);
+  int ParsePHI(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS);
+  bool ParseCall(Instruction *&Inst, PerFunctionState &PFS,
+                 CallInst::TailCallKind TCK);
+  int ParseAlloc(Instruction *&Inst, PerFunctionState &PFS);
+  int ParseLoad(Instruction *&Inst, PerFunctionState &PFS);
+  int ParseStore(Instruction *&Inst, PerFunctionState &PFS);
+  int ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS);
+  int ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS);
+  int ParseFence(Instruction *&Inst, PerFunctionState &PFS);
+  int ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS);
+  int ParseExtractValue(Instruction *&Inst, PerFunctionState &PFS);
+  int ParseInsertValue(Instruction *&Inst, PerFunctionState &PFS);
+
+  // Use-list order directives.
+  bool ParseUseListOrder(PerFunctionState *PFS = nullptr);
+  bool ParseUseListOrderBB();
+  bool ParseUseListOrderIndexes(SmallVectorImpl<unsigned> &Indexes);
+  bool sortUseListOrder(Value *V, ArrayRef<unsigned> Indexes, SMLoc Loc);
+};
+} // namespace llvm
 
 #endif
diff --git a/hpvm/llvm_patches/lib/AsmParser/LLToken.h b/hpvm/llvm_patches/lib/AsmParser/LLToken.h
index 3c2eade04f928d737ad8d2408ab4c56825e34ab0..7f9816965b2a21ae3d23873ca789a22481b575fa 100644
--- a/hpvm/llvm_patches/lib/AsmParser/LLToken.h
+++ b/hpvm/llvm_patches/lib/AsmParser/LLToken.h
@@ -351,10 +351,10 @@ enum Kind {
   kw_insertvalue,
   kw_blockaddress,
 
-   // VISC parameter attributes
-   kw_in, 
-   kw_out,
-   kw_inout,
+  // VISC parameter attributes
+  kw_in,
+  kw_out,
+  kw_inout,
 
   // Metadata types.
   kw_distinct,
diff --git a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp
index c6530f992bab22d36a1273f1b1c454970270c928..7eb289d5872713ef826174b1e691c6440d4dd43e 100644
--- a/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/hpvm/llvm_patches/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -20,8 +20,8 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
-#include "llvm/Bitstream/BitstreamReader.h"
 #include "llvm/Bitcode/LLVMBitCodes.h"
+#include "llvm/Bitstream/BitstreamReader.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/Attributes.h"
@@ -211,9 +211,9 @@ static Expected<std::string> readIdentificationBlock(BitstreamCursor &Stream) {
     case bitc::IDENTIFICATION_CODE_EPOCH: { // EPOCH: [epoch#]
       unsigned epoch = (unsigned)Record[0];
       if (epoch != bitc::BITCODE_CURRENT_EPOCH) {
-        return error(
-          Twine("Incompatible epoch: Bitcode '") + Twine(epoch) +
-          "' vs current: '" + Twine(bitc::BITCODE_CURRENT_EPOCH) + "'");
+        return error(Twine("Incompatible epoch: Bitcode '") + Twine(epoch) +
+                     "' vs current: '" + Twine(bitc::BITCODE_CURRENT_EPOCH) +
+                     "'");
       }
     }
     }
@@ -367,8 +367,9 @@ static Expected<std::string> readModuleTriple(BitstreamCursor &Stream) {
     if (!MaybeRecord)
       return MaybeRecord.takeError();
     switch (MaybeRecord.get()) {
-    default: break;  // Default behavior, ignore unknown content.
-    case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
+    default:
+      break; // Default behavior, ignore unknown content.
+    case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N]
       std::string S;
       if (convertToString(Record, 0, S))
         return error("Invalid record");
@@ -493,7 +494,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
   std::vector<std::string> SectionTable;
   std::vector<std::string> GCTable;
 
-  std::vector<Type*> TypeList;
+  std::vector<Type *> TypeList;
   DenseMap<Function *, FunctionType *> FunctionTypes;
   BitcodeReaderValueList ValueList;
   Optional<MetadataLoader> MDLoader;
@@ -515,11 +516,11 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
 
   /// While parsing a function body, this is a list of the basic blocks for the
   /// function.
-  std::vector<BasicBlock*> FunctionBBs;
+  std::vector<BasicBlock *> FunctionBBs;
 
   // When reading the module header, this list is populated with functions that
   // have bodies later in the file.
-  std::vector<Function*> FunctionsWithBodies;
+  std::vector<Function *> FunctionsWithBodies;
 
   // When intrinsic functions are encountered which require upgrading they are
   // stored here with their replacement function.
@@ -535,7 +536,7 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
 
   /// When function bodies are initially scanned, this map contains info about
   /// where to find deferred function body in the stream.
-  DenseMap<Function*, uint64_t> DeferredFunctionInfo;
+  DenseMap<Function *, uint64_t> DeferredFunctionInfo;
 
   /// When Metadata block is initially scanned when parsing the module, we may
   /// choose to defer parsing of the metadata. This vector contains info about
@@ -597,9 +598,7 @@ private:
   /// type in the same address space if opaque pointers are being
   /// used, otherwise nop. This converts a bitcode-reader internal
   /// type into one suitable for use in a Value.
-  Type *flattenPointerTypes(Type *Ty) {
-    return Ty;
-  }
+  Type *flattenPointerTypes(Type *Ty) { return Ty; }
 
   /// Given a fully structured pointer type (i.e. not opaque), return
   /// the flattened form of its element, suitable for use in a Value.
@@ -636,13 +635,14 @@ private:
   }
 
   BasicBlock *getBasicBlock(unsigned ID) const {
-    if (ID >= FunctionBBs.size()) return nullptr; // Invalid ID
+    if (ID >= FunctionBBs.size())
+      return nullptr; // Invalid ID
     return FunctionBBs[ID];
   }
 
   AttributeList getAttributes(unsigned i) const {
-    if (i-1 < MAttributes.size())
-      return MAttributes[i-1];
+    if (i - 1 < MAttributes.size())
+      return MAttributes[i - 1];
     return AttributeList();
   }
 
@@ -652,7 +652,8 @@ private:
   bool getValueTypePair(SmallVectorImpl<uint64_t> &Record, unsigned &Slot,
                         unsigned InstNum, Value *&ResVal,
                         Type **FullTy = nullptr) {
-    if (Slot == Record.size()) return true;
+    if (Slot == Record.size())
+      return true;
     unsigned ValNo = (unsigned)Record[Slot++];
     // Adjust the ValNo, if it was encoded relative to the InstNum.
     if (UseRelativeIDs)
@@ -696,7 +697,8 @@ private:
   /// error.
   Value *getValue(SmallVectorImpl<uint64_t> &Record, unsigned Slot,
                   unsigned InstNum, Type *Ty) {
-    if (Slot == Record.size()) return nullptr;
+    if (Slot == Record.size())
+      return nullptr;
     unsigned ValNo = (unsigned)Record[Slot];
     // Adjust the ValNo, if it was encoded relative to the InstNum.
     if (UseRelativeIDs)
@@ -707,7 +709,8 @@ private:
   /// Like getValue, but decodes signed VBRs.
   Value *getValueSigned(SmallVectorImpl<uint64_t> &Record, unsigned Slot,
                         unsigned InstNum, Type *Ty) {
-    if (Slot == Record.size()) return nullptr;
+    if (Slot == Record.size())
+      return nullptr;
     unsigned ValNo = (unsigned)decodeSignRotatedValue(Record[Slot]);
     // Adjust the ValNo, if it was encoded relative to the InstNum.
     if (UseRelativeIDs)
@@ -938,7 +941,7 @@ static GlobalValue::LinkageTypes getDecodedLinkage(unsigned Val) {
     return GlobalValue::PrivateLinkage; // Obsolete LinkerPrivateWeakLinkage
   case 15:
     return GlobalValue::ExternalLinkage; // Obsolete LinkOnceODRAutoHideLinkage
-  case 1: // Old value with implicit comdat.
+  case 1:                                // Old value with implicit comdat.
   case 16:
     return GlobalValue::WeakAnyLinkage;
   case 10: // Old value with implicit comdat.
@@ -979,7 +982,8 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags,
   bool Local = (RawFlags & 0x4);
   bool AutoHide = (RawFlags & 0x8);
 
-  return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local, AutoHide);
+  return GlobalValueSummary::GVFlags(Linkage, NotEligibleToImport, Live, Local,
+                                     AutoHide);
 }
 
 // Decode the flags for GlobalVariable in the summary
@@ -991,9 +995,12 @@ static GlobalVarSummary::GVarFlags getDecodedGVarFlags(uint64_t RawFlags) {
 static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) {
   switch (Val) {
   default: // Map unknown visibilities to default.
-  case 0: return GlobalValue::DefaultVisibility;
-  case 1: return GlobalValue::HiddenVisibility;
-  case 2: return GlobalValue::ProtectedVisibility;
+  case 0:
+    return GlobalValue::DefaultVisibility;
+  case 1:
+    return GlobalValue::HiddenVisibility;
+  case 2:
+    return GlobalValue::ProtectedVisibility;
   }
 }
 
@@ -1001,56 +1008,83 @@ static GlobalValue::DLLStorageClassTypes
 getDecodedDLLStorageClass(unsigned Val) {
   switch (Val) {
   default: // Map unknown values to default.
-  case 0: return GlobalValue::DefaultStorageClass;
-  case 1: return GlobalValue::DLLImportStorageClass;
-  case 2: return GlobalValue::DLLExportStorageClass;
+  case 0:
+    return GlobalValue::DefaultStorageClass;
+  case 1:
+    return GlobalValue::DLLImportStorageClass;
+  case 2:
+    return GlobalValue::DLLExportStorageClass;
   }
 }
 
 static bool getDecodedDSOLocal(unsigned Val) {
-  switch(Val) {
+  switch (Val) {
   default: // Map unknown values to preemptable.
-  case 0:  return false;
-  case 1:  return true;
+  case 0:
+    return false;
+  case 1:
+    return true;
   }
 }
 
 static GlobalVariable::ThreadLocalMode getDecodedThreadLocalMode(unsigned Val) {
   switch (Val) {
-    case 0: return GlobalVariable::NotThreadLocal;
-    default: // Map unknown non-zero value to general dynamic.
-    case 1: return GlobalVariable::GeneralDynamicTLSModel;
-    case 2: return GlobalVariable::LocalDynamicTLSModel;
-    case 3: return GlobalVariable::InitialExecTLSModel;
-    case 4: return GlobalVariable::LocalExecTLSModel;
+  case 0:
+    return GlobalVariable::NotThreadLocal;
+  default: // Map unknown non-zero value to general dynamic.
+  case 1:
+    return GlobalVariable::GeneralDynamicTLSModel;
+  case 2:
+    return GlobalVariable::LocalDynamicTLSModel;
+  case 3:
+    return GlobalVariable::InitialExecTLSModel;
+  case 4:
+    return GlobalVariable::LocalExecTLSModel;
   }
 }
 
 static GlobalVariable::UnnamedAddr getDecodedUnnamedAddrType(unsigned Val) {
   switch (Val) {
-    default: // Map unknown to UnnamedAddr::None.
-    case 0: return GlobalVariable::UnnamedAddr::None;
-    case 1: return GlobalVariable::UnnamedAddr::Global;
-    case 2: return GlobalVariable::UnnamedAddr::Local;
+  default: // Map unknown to UnnamedAddr::None.
+  case 0:
+    return GlobalVariable::UnnamedAddr::None;
+  case 1:
+    return GlobalVariable::UnnamedAddr::Global;
+  case 2:
+    return GlobalVariable::UnnamedAddr::Local;
   }
 }
 
 static int getDecodedCastOpcode(unsigned Val) {
   switch (Val) {
-  default: return -1;
-  case bitc::CAST_TRUNC   : return Instruction::Trunc;
-  case bitc::CAST_ZEXT    : return Instruction::ZExt;
-  case bitc::CAST_SEXT    : return Instruction::SExt;
-  case bitc::CAST_FPTOUI  : return Instruction::FPToUI;
-  case bitc::CAST_FPTOSI  : return Instruction::FPToSI;
-  case bitc::CAST_UITOFP  : return Instruction::UIToFP;
-  case bitc::CAST_SITOFP  : return Instruction::SIToFP;
-  case bitc::CAST_FPTRUNC : return Instruction::FPTrunc;
-  case bitc::CAST_FPEXT   : return Instruction::FPExt;
-  case bitc::CAST_PTRTOINT: return Instruction::PtrToInt;
-  case bitc::CAST_INTTOPTR: return Instruction::IntToPtr;
-  case bitc::CAST_BITCAST : return Instruction::BitCast;
-  case bitc::CAST_ADDRSPACECAST: return Instruction::AddrSpaceCast;
+  default:
+    return -1;
+  case bitc::CAST_TRUNC:
+    return Instruction::Trunc;
+  case bitc::CAST_ZEXT:
+    return Instruction::ZExt;
+  case bitc::CAST_SEXT:
+    return Instruction::SExt;
+  case bitc::CAST_FPTOUI:
+    return Instruction::FPToUI;
+  case bitc::CAST_FPTOSI:
+    return Instruction::FPToSI;
+  case bitc::CAST_UITOFP:
+    return Instruction::UIToFP;
+  case bitc::CAST_SITOFP:
+    return Instruction::SIToFP;
+  case bitc::CAST_FPTRUNC:
+    return Instruction::FPTrunc;
+  case bitc::CAST_FPEXT:
+    return Instruction::FPExt;
+  case bitc::CAST_PTRTOINT:
+    return Instruction::PtrToInt;
+  case bitc::CAST_INTTOPTR:
+    return Instruction::IntToPtr;
+  case bitc::CAST_BITCAST:
+    return Instruction::BitCast;
+  case bitc::CAST_ADDRSPACECAST:
+    return Instruction::AddrSpaceCast;
   }
 }
 
@@ -1108,33 +1142,54 @@ static int getDecodedBinaryOpcode(unsigned Val, Type *Ty) {
 
 static AtomicRMWInst::BinOp getDecodedRMWOperation(unsigned Val) {
   switch (Val) {
-  default: return AtomicRMWInst::BAD_BINOP;
-  case bitc::RMW_XCHG: return AtomicRMWInst::Xchg;
-  case bitc::RMW_ADD: return AtomicRMWInst::Add;
-  case bitc::RMW_SUB: return AtomicRMWInst::Sub;
-  case bitc::RMW_AND: return AtomicRMWInst::And;
-  case bitc::RMW_NAND: return AtomicRMWInst::Nand;
-  case bitc::RMW_OR: return AtomicRMWInst::Or;
-  case bitc::RMW_XOR: return AtomicRMWInst::Xor;
-  case bitc::RMW_MAX: return AtomicRMWInst::Max;
-  case bitc::RMW_MIN: return AtomicRMWInst::Min;
-  case bitc::RMW_UMAX: return AtomicRMWInst::UMax;
-  case bitc::RMW_UMIN: return AtomicRMWInst::UMin;
-  case bitc::RMW_FADD: return AtomicRMWInst::FAdd;
-  case bitc::RMW_FSUB: return AtomicRMWInst::FSub;
+  default:
+    return AtomicRMWInst::BAD_BINOP;
+  case bitc::RMW_XCHG:
+    return AtomicRMWInst::Xchg;
+  case bitc::RMW_ADD:
+    return AtomicRMWInst::Add;
+  case bitc::RMW_SUB:
+    return AtomicRMWInst::Sub;
+  case bitc::RMW_AND:
+    return AtomicRMWInst::And;
+  case bitc::RMW_NAND:
+    return AtomicRMWInst::Nand;
+  case bitc::RMW_OR:
+    return AtomicRMWInst::Or;
+  case bitc::RMW_XOR:
+    return AtomicRMWInst::Xor;
+  case bitc::RMW_MAX:
+    return AtomicRMWInst::Max;
+  case bitc::RMW_MIN:
+    return AtomicRMWInst::Min;
+  case bitc::RMW_UMAX:
+    return AtomicRMWInst::UMax;
+  case bitc::RMW_UMIN:
+    return AtomicRMWInst::UMin;
+  case bitc::RMW_FADD:
+    return AtomicRMWInst::FAdd;
+  case bitc::RMW_FSUB:
+    return AtomicRMWInst::FSub;
   }
 }
 
 static AtomicOrdering getDecodedOrdering(unsigned Val) {
   switch (Val) {
-  case bitc::ORDERING_NOTATOMIC: return AtomicOrdering::NotAtomic;
-  case bitc::ORDERING_UNORDERED: return AtomicOrdering::Unordered;
-  case bitc::ORDERING_MONOTONIC: return AtomicOrdering::Monotonic;
-  case bitc::ORDERING_ACQUIRE: return AtomicOrdering::Acquire;
-  case bitc::ORDERING_RELEASE: return AtomicOrdering::Release;
-  case bitc::ORDERING_ACQREL: return AtomicOrdering::AcquireRelease;
+  case bitc::ORDERING_NOTATOMIC:
+    return AtomicOrdering::NotAtomic;
+  case bitc::ORDERING_UNORDERED:
+    return AtomicOrdering::Unordered;
+  case bitc::ORDERING_MONOTONIC:
+    return AtomicOrdering::Monotonic;
+  case bitc::ORDERING_ACQUIRE:
+    return AtomicOrdering::Acquire;
+  case bitc::ORDERING_RELEASE:
+    return AtomicOrdering::Release;
+  case bitc::ORDERING_ACQREL:
+    return AtomicOrdering::AcquireRelease;
   default: // Map unknown orderings to sequentially-consistent.
-  case bitc::ORDERING_SEQCST: return AtomicOrdering::SequentiallyConsistent;
+  case bitc::ORDERING_SEQCST:
+    return AtomicOrdering::SequentiallyConsistent;
   }
 }
 
@@ -1177,8 +1232,12 @@ static FastMathFlags getDecodedFastMathFlags(unsigned Val) {
 
 static void upgradeDLLImportExportLinkage(GlobalValue *GV, unsigned Val) {
   switch (Val) {
-  case 5: GV->setDLLStorageClass(GlobalValue::DLLImportStorageClass); break;
-  case 6: GV->setDLLStorageClass(GlobalValue::DLLExportStorageClass); break;
+  case 5:
+    GV->setDLLStorageClass(GlobalValue::DLLImportStorageClass);
+    break;
+  case 6:
+    GV->setDLLStorageClass(GlobalValue::DLLExportStorageClass);
+    break;
   }
 }
 
@@ -1217,61 +1276,116 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) {
   case Attribute::EndAttrKinds:
     llvm_unreachable("Synthetic enumerators which should never get here");
 
-  case Attribute::None:            return 0;
-  case Attribute::ZExt:            return 1 << 0;
-  case Attribute::SExt:            return 1 << 1;
-  case Attribute::NoReturn:        return 1 << 2;
-  case Attribute::InReg:           return 1 << 3;
-  case Attribute::StructRet:       return 1 << 4;
-  case Attribute::NoUnwind:        return 1 << 5;
-  case Attribute::NoAlias:         return 1 << 6;
-  case Attribute::ByVal:           return 1 << 7;
-  case Attribute::Nest:            return 1 << 8;
-  case Attribute::ReadNone:        return 1 << 9;
-  case Attribute::ReadOnly:        return 1 << 10;
-  case Attribute::NoInline:        return 1 << 11;
-  case Attribute::AlwaysInline:    return 1 << 12;
-  case Attribute::OptimizeForSize: return 1 << 13;
-  case Attribute::StackProtect:    return 1 << 14;
-  case Attribute::StackProtectReq: return 1 << 15;
-  case Attribute::Alignment:       return 31 << 16;
-  case Attribute::NoCapture:       return 1 << 21;
-  case Attribute::NoRedZone:       return 1 << 22;
-  case Attribute::NoImplicitFloat: return 1 << 23;
-  case Attribute::Naked:           return 1 << 24;
-  case Attribute::InlineHint:      return 1 << 25;
-  case Attribute::StackAlignment:  return 7 << 26;
-  case Attribute::ReturnsTwice:    return 1 << 29;
-  case Attribute::UWTable:         return 1 << 30;
-  case Attribute::NonLazyBind:     return 1U << 31;
-  case Attribute::SanitizeAddress: return 1ULL << 32;
-  case Attribute::MinSize:         return 1ULL << 33;
-  case Attribute::NoDuplicate:     return 1ULL << 34;
-  case Attribute::StackProtectStrong: return 1ULL << 35;
-  case Attribute::SanitizeThread:  return 1ULL << 36;
-  case Attribute::SanitizeMemory:  return 1ULL << 37;
-  case Attribute::NoBuiltin:       return 1ULL << 38;
-  case Attribute::Returned:        return 1ULL << 39;
-  case Attribute::Cold:            return 1ULL << 40;
-  case Attribute::Builtin:         return 1ULL << 41;
-  case Attribute::OptimizeNone:    return 1ULL << 42;
-  case Attribute::InAlloca:        return 1ULL << 43;
-  case Attribute::NonNull:         return 1ULL << 44;
-  case Attribute::JumpTable:       return 1ULL << 45;
-  case Attribute::Convergent:      return 1ULL << 46;
-  case Attribute::SafeStack:       return 1ULL << 47;
-  case Attribute::NoRecurse:       return 1ULL << 48;
-  case Attribute::InaccessibleMemOnly:         return 1ULL << 49;
-  case Attribute::InaccessibleMemOrArgMemOnly: return 1ULL << 50;
-  case Attribute::SwiftSelf:       return 1ULL << 51;
-  case Attribute::SwiftError:      return 1ULL << 52;
-  case Attribute::WriteOnly:       return 1ULL << 53;
-  case Attribute::Speculatable:    return 1ULL << 54;
-  case Attribute::StrictFP:        return 1ULL << 55;
-  case Attribute::SanitizeHWAddress: return 1ULL << 56;
-  case Attribute::NoCfCheck:       return 1ULL << 57;
-  case Attribute::OptForFuzzing:   return 1ULL << 58;
-  case Attribute::ShadowCallStack: return 1ULL << 59;
+  case Attribute::None:
+    return 0;
+  case Attribute::ZExt:
+    return 1 << 0;
+  case Attribute::SExt:
+    return 1 << 1;
+  case Attribute::NoReturn:
+    return 1 << 2;
+  case Attribute::InReg:
+    return 1 << 3;
+  case Attribute::StructRet:
+    return 1 << 4;
+  case Attribute::NoUnwind:
+    return 1 << 5;
+  case Attribute::NoAlias:
+    return 1 << 6;
+  case Attribute::ByVal:
+    return 1 << 7;
+  case Attribute::Nest:
+    return 1 << 8;
+  case Attribute::ReadNone:
+    return 1 << 9;
+  case Attribute::ReadOnly:
+    return 1 << 10;
+  case Attribute::NoInline:
+    return 1 << 11;
+  case Attribute::AlwaysInline:
+    return 1 << 12;
+  case Attribute::OptimizeForSize:
+    return 1 << 13;
+  case Attribute::StackProtect:
+    return 1 << 14;
+  case Attribute::StackProtectReq:
+    return 1 << 15;
+  case Attribute::Alignment:
+    return 31 << 16;
+  case Attribute::NoCapture:
+    return 1 << 21;
+  case Attribute::NoRedZone:
+    return 1 << 22;
+  case Attribute::NoImplicitFloat:
+    return 1 << 23;
+  case Attribute::Naked:
+    return 1 << 24;
+  case Attribute::InlineHint:
+    return 1 << 25;
+  case Attribute::StackAlignment:
+    return 7 << 26;
+  case Attribute::ReturnsTwice:
+    return 1 << 29;
+  case Attribute::UWTable:
+    return 1 << 30;
+  case Attribute::NonLazyBind:
+    return 1U << 31;
+  case Attribute::SanitizeAddress:
+    return 1ULL << 32;
+  case Attribute::MinSize:
+    return 1ULL << 33;
+  case Attribute::NoDuplicate:
+    return 1ULL << 34;
+  case Attribute::StackProtectStrong:
+    return 1ULL << 35;
+  case Attribute::SanitizeThread:
+    return 1ULL << 36;
+  case Attribute::SanitizeMemory:
+    return 1ULL << 37;
+  case Attribute::NoBuiltin:
+    return 1ULL << 38;
+  case Attribute::Returned:
+    return 1ULL << 39;
+  case Attribute::Cold:
+    return 1ULL << 40;
+  case Attribute::Builtin:
+    return 1ULL << 41;
+  case Attribute::OptimizeNone:
+    return 1ULL << 42;
+  case Attribute::InAlloca:
+    return 1ULL << 43;
+  case Attribute::NonNull:
+    return 1ULL << 44;
+  case Attribute::JumpTable:
+    return 1ULL << 45;
+  case Attribute::Convergent:
+    return 1ULL << 46;
+  case Attribute::SafeStack:
+    return 1ULL << 47;
+  case Attribute::NoRecurse:
+    return 1ULL << 48;
+  case Attribute::InaccessibleMemOnly:
+    return 1ULL << 49;
+  case Attribute::InaccessibleMemOrArgMemOnly:
+    return 1ULL << 50;
+  case Attribute::SwiftSelf:
+    return 1ULL << 51;
+  case Attribute::SwiftError:
+    return 1ULL << 52;
+  case Attribute::WriteOnly:
+    return 1ULL << 53;
+  case Attribute::Speculatable:
+    return 1ULL << 54;
+  case Attribute::StrictFP:
+    return 1ULL << 55;
+  case Attribute::SanitizeHWAddress:
+    return 1ULL << 56;
+  case Attribute::NoCfCheck:
+    return 1ULL << 57;
+  case Attribute::OptForFuzzing:
+    return 1ULL << 58;
+  case Attribute::ShadowCallStack:
+    return 1ULL << 59;
   case Attribute::SpeculativeLoadHardening:
     return 1ULL << 60;
   case Attribute::ImmArg:
@@ -1281,10 +1395,13 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) {
   case Attribute::NoFree:
     return 1ULL << 63;
 
-   // VISC Attributes
-  case Attribute::In:              return 3ULL << 0;
-  case Attribute::Out:             return 3ULL << 1;  
-  case Attribute::InOut:           return 3ULL << 2;
+    // VISC Attributes. NOTE(review): masks overlap ZExt/SExt/NoReturn/InReg.
+  case Attribute::In:
+    return 3ULL << 0;
+  case Attribute::Out:
+    return 3ULL << 1;
+  case Attribute::InOut:
+    return 3ULL << 2;
 
   case Attribute::NoSync:
     llvm_unreachable("nosync attribute not supported in raw format");
@@ -1310,22 +1427,20 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) {
 }
 
 static void addRawAttributeValue(AttrBuilder &B, uint64_t Val) {
-  if (!Val) return;
+  if (!Val)
+    return;
 
   for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds;
        I = Attribute::AttrKind(I + 1)) {
-    if (I == Attribute::SanitizeMemTag ||
-        I == Attribute::Dereferenceable ||
-        I == Attribute::DereferenceableOrNull ||
-        I == Attribute::ArgMemOnly ||
-        I == Attribute::AllocSize ||
-        I == Attribute::NoSync)
+    if (I == Attribute::SanitizeMemTag || I == Attribute::Dereferenceable ||
+        I == Attribute::DereferenceableOrNull || I == Attribute::ArgMemOnly ||
+        I == Attribute::AllocSize || I == Attribute::NoSync)
       continue;
     if (uint64_t A = (Val & getRawAttributeMask(I))) {
       if (I == Attribute::Alignment)
         B.addAlignmentAttr(1ULL << ((A >> 16) - 1));
       else if (I == Attribute::StackAlignment)
-        B.addStackAlignmentAttr(1ULL << ((A >> 26)-1));
+        B.addStackAlignmentAttr(1ULL << ((A >> 26) - 1));
       else
         B.addAttribute(I);
     }
@@ -1348,7 +1463,7 @@ static void decodeLLVMAttributesForBitcode(AttrBuilder &B,
   if (Alignment)
     B.addAlignmentAttr(Alignment);
   addRawAttributeValue(B, ((EncodedAttrs & (0xfffffULL << 32)) >> 11) |
-                          (EncodedAttrs & 0xffff));
+                              (EncodedAttrs & 0xffff));
 }
 
 Error BitcodeReader::parseAttributeBlock() {
@@ -1386,7 +1501,7 @@ Error BitcodeReader::parseAttributeBlock() {
     if (!MaybeRecord)
       return MaybeRecord.takeError();
     switch (MaybeRecord.get()) {
-    default:  // Default behavior: ignore.
+    default: // Default behavior: ignore.
       break;
     case bitc::PARAMATTR_CODE_ENTRY_OLD: // ENTRY: [paramidx0, attr0, ...]
       // FIXME: Remove in 4.0.
@@ -1395,7 +1510,7 @@ Error BitcodeReader::parseAttributeBlock() {
 
       for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
         AttrBuilder B;
-        decodeLLVMAttributesForBitcode(B, Record[i+1]);
+        decodeLLVMAttributesForBitcode(B, Record[i + 1]);
         Attrs.push_back(AttributeList::get(Context, Record[i], B));
       }
 
@@ -1599,7 +1714,7 @@ Error BitcodeReader::parseAttributeGroupBlock() {
     if (!MaybeRecord)
       return MaybeRecord.takeError();
     switch (MaybeRecord.get()) {
-    default:  // Default behavior: ignore.
+    default: // Default behavior: ignore.
       break;
     case bitc::PARAMATTR_GRP_CODE_ENTRY: { // ENTRY: [grpid, idx, a0, a1, ...]
       if (Record.size() < 3)
@@ -1610,7 +1725,7 @@ Error BitcodeReader::parseAttributeGroupBlock() {
 
       AttrBuilder B;
       for (unsigned i = 2, e = Record.size(); i != e; ++i) {
-        if (Record[i] == 0) {        // Enum attribute
+        if (Record[i] == 0) { // Enum attribute
           Attribute::AttrKind Kind;
           if (Error Err = parseAttrKind(Record[++i], &Kind))
             return Err;
@@ -1725,37 +1840,37 @@ Error BitcodeReader::parseTypeTableBody() {
         return error("Invalid record");
       TypeList.resize(Record[0]);
       continue;
-    case bitc::TYPE_CODE_VOID:      // VOID
+    case bitc::TYPE_CODE_VOID: // VOID
       ResultTy = Type::getVoidTy(Context);
       break;
-    case bitc::TYPE_CODE_HALF:     // HALF
+    case bitc::TYPE_CODE_HALF: // HALF
       ResultTy = Type::getHalfTy(Context);
       break;
-    case bitc::TYPE_CODE_FLOAT:     // FLOAT
+    case bitc::TYPE_CODE_FLOAT: // FLOAT
       ResultTy = Type::getFloatTy(Context);
       break;
-    case bitc::TYPE_CODE_DOUBLE:    // DOUBLE
+    case bitc::TYPE_CODE_DOUBLE: // DOUBLE
       ResultTy = Type::getDoubleTy(Context);
       break;
-    case bitc::TYPE_CODE_X86_FP80:  // X86_FP80
+    case bitc::TYPE_CODE_X86_FP80: // X86_FP80
       ResultTy = Type::getX86_FP80Ty(Context);
       break;
-    case bitc::TYPE_CODE_FP128:     // FP128
+    case bitc::TYPE_CODE_FP128: // FP128
       ResultTy = Type::getFP128Ty(Context);
       break;
     case bitc::TYPE_CODE_PPC_FP128: // PPC_FP128
       ResultTy = Type::getPPC_FP128Ty(Context);
       break;
-    case bitc::TYPE_CODE_LABEL:     // LABEL
+    case bitc::TYPE_CODE_LABEL: // LABEL
       ResultTy = Type::getLabelTy(Context);
       break;
-    case bitc::TYPE_CODE_METADATA:  // METADATA
+    case bitc::TYPE_CODE_METADATA: // METADATA
       ResultTy = Type::getMetadataTy(Context);
       break;
-    case bitc::TYPE_CODE_X86_MMX:   // X86_MMX
+    case bitc::TYPE_CODE_X86_MMX: // X86_MMX
       ResultTy = Type::getX86_MMXTy(Context);
       break;
-    case bitc::TYPE_CODE_TOKEN:     // TOKEN
+    case bitc::TYPE_CODE_TOKEN: // TOKEN
       ResultTy = Type::getTokenTy(Context);
       break;
     case bitc::TYPE_CODE_INTEGER: { // INTEGER: [width]
@@ -1777,8 +1892,7 @@ Error BitcodeReader::parseTypeTableBody() {
       if (Record.size() == 2)
         AddressSpace = Record[1];
       ResultTy = getTypeByID(Record[0]);
-      if (!ResultTy ||
-          !PointerType::isValidElementType(ResultTy))
+      if (!ResultTy || !PointerType::isValidElementType(ResultTy))
         return error("Invalid type");
       ResultTy = PointerType::get(ResultTy, AddressSpace);
       break;
@@ -1788,7 +1902,7 @@ Error BitcodeReader::parseTypeTableBody() {
       // FUNCTION: [vararg, attrid, retty, paramty x N]
       if (Record.size() < 3)
         return error("Invalid record");
-      SmallVector<Type*, 8> ArgTys;
+      SmallVector<Type *, 8> ArgTys;
       for (unsigned i = 3, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
           ArgTys.push_back(T);
@@ -1797,7 +1911,7 @@ Error BitcodeReader::parseTypeTableBody() {
       }
 
       ResultTy = getTypeByID(Record[2]);
-      if (!ResultTy || ArgTys.size() < Record.size()-3)
+      if (!ResultTy || ArgTys.size() < Record.size() - 3)
         return error("Invalid type");
 
       ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
@@ -1807,40 +1921,39 @@ Error BitcodeReader::parseTypeTableBody() {
       // FUNCTION: [vararg, retty, paramty x N]
       if (Record.size() < 2)
         return error("Invalid record");
-      SmallVector<Type*, 8> ArgTys;
+      SmallVector<Type *, 8> ArgTys;
       for (unsigned i = 2, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i])) {
           if (!FunctionType::isValidArgumentType(T))
             return error("Invalid function argument type");
           ArgTys.push_back(T);
-        }
-        else
+        } else
           break;
       }
 
       ResultTy = getTypeByID(Record[1]);
-      if (!ResultTy || ArgTys.size() < Record.size()-2)
+      if (!ResultTy || ArgTys.size() < Record.size() - 2)
         return error("Invalid type");
 
       ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
       break;
     }
-    case bitc::TYPE_CODE_STRUCT_ANON: {  // STRUCT: [ispacked, eltty x N]
+    case bitc::TYPE_CODE_STRUCT_ANON: { // STRUCT: [ispacked, eltty x N]
       if (Record.size() < 1)
         return error("Invalid record");
-      SmallVector<Type*, 8> EltTys;
+      SmallVector<Type *, 8> EltTys;
       for (unsigned i = 1, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
           EltTys.push_back(T);
         else
           break;
       }
-      if (EltTys.size() != Record.size()-1)
+      if (EltTys.size() != Record.size() - 1)
         return error("Invalid type");
       ResultTy = StructType::get(Context, EltTys, Record[0]);
       break;
     }
-    case bitc::TYPE_CODE_STRUCT_NAME:   // STRUCT_NAME: [strchr x N]
+    case bitc::TYPE_CODE_STRUCT_NAME: // STRUCT_NAME: [strchr x N]
       if (convertToString(Record, 0, TypeName))
         return error("Invalid record");
       continue;
@@ -1857,24 +1970,24 @@ Error BitcodeReader::parseTypeTableBody() {
       if (Res) {
         Res->setName(TypeName);
         TypeList[NumRecords] = nullptr;
-      } else  // Otherwise, create a new struct.
+      } else // Otherwise, create a new struct.
         Res = createIdentifiedStructType(Context, TypeName);
       TypeName.clear();
 
-      SmallVector<Type*, 8> EltTys;
+      SmallVector<Type *, 8> EltTys;
       for (unsigned i = 1, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
           EltTys.push_back(T);
         else
           break;
       }
-      if (EltTys.size() != Record.size()-1)
+      if (EltTys.size() != Record.size() - 1)
         return error("Invalid record");
       Res->setBody(EltTys, Record[0]);
       ResultTy = Res;
       break;
     }
-    case bitc::TYPE_CODE_OPAQUE: {       // OPAQUE: []
+    case bitc::TYPE_CODE_OPAQUE: { // OPAQUE: []
       if (Record.size() != 1)
         return error("Invalid record");
 
@@ -1886,13 +1999,13 @@ Error BitcodeReader::parseTypeTableBody() {
       if (Res) {
         Res->setName(TypeName);
         TypeList[NumRecords] = nullptr;
-      } else  // Otherwise, create a new struct with no body.
+      } else // Otherwise, create a new struct with no body.
         Res = createIdentifiedStructType(Context, TypeName);
       TypeName.clear();
       ResultTy = Res;
       break;
     }
-    case bitc::TYPE_CODE_ARRAY:     // ARRAY: [numelts, eltty]
+    case bitc::TYPE_CODE_ARRAY: // ARRAY: [numelts, eltty]
       if (Record.size() < 2)
         return error("Invalid record");
       ResultTy = getTypeByID(Record[1]);
@@ -1900,8 +2013,8 @@ Error BitcodeReader::parseTypeTableBody() {
         return error("Invalid type");
       ResultTy = ArrayType::get(ResultTy, Record[0]);
       break;
-    case bitc::TYPE_CODE_VECTOR:    // VECTOR: [numelts, eltty] or
-                                    //         [numelts, eltty, scalable]
+    case bitc::TYPE_CODE_VECTOR: // VECTOR: [numelts, eltty] or
+                                 //         [numelts, eltty, scalable]
       if (Record.size() < 2)
         return error("Invalid record");
       if (Record[0] == 0)
@@ -2183,9 +2296,9 @@ Error BitcodeReader::parseValueSymbolTable(uint64_t Offset) {
     if (!MaybeRecord)
       return MaybeRecord.takeError();
     switch (MaybeRecord.get()) {
-    default:  // Default behavior: unknown type.
+    default: // Default behavior: unknown type.
       break;
-    case bitc::VST_CODE_ENTRY: {  // VST_CODE_ENTRY: [valueid, namechar x N]
+    case bitc::VST_CODE_ENTRY: { // VST_CODE_ENTRY: [valueid, namechar x N]
       Expected<Value *> ValOrErr = recordValue(Record, 1, TT);
       if (Error Err = ValOrErr.takeError())
         return Err;
@@ -2320,8 +2433,7 @@ Error BitcodeReader::resolveGlobalAndIndirectSymbolInits() {
 
 static APInt readWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
   SmallVector<uint64_t, 8> Words(Vals.size());
-  transform(Vals, Words.begin(),
-                 BitcodeReader::decodeSignRotatedValue);
+  transform(Vals, Words.begin(), BitcodeReader::decodeSignRotatedValue);
 
   return APInt(TypeBits, Words);
 }
@@ -2368,11 +2480,11 @@ Error BitcodeReader::parseConstants() {
     if (!MaybeBitCode)
       return MaybeBitCode.takeError();
     switch (unsigned BitCode = MaybeBitCode.get()) {
-    default:  // Default behavior: unknown constant
-    case bitc::CST_CODE_UNDEF:     // UNDEF
+    default:                   // Default behavior: unknown constant
+    case bitc::CST_CODE_UNDEF: // UNDEF
       V = UndefValue::get(CurTy);
       break;
-    case bitc::CST_CODE_SETTYPE:   // SETTYPE: [typeid]
+    case bitc::CST_CODE_SETTYPE: // SETTYPE: [typeid]
       if (Record.empty())
         return error("Invalid record");
       if (Record[0] >= TypeList.size() || !TypeList[Record[0]])
@@ -2381,16 +2493,16 @@ Error BitcodeReader::parseConstants() {
         return error("Invalid constant type");
       CurFullTy = TypeList[Record[0]];
       CurTy = flattenPointerTypes(CurFullTy);
-      continue;  // Skip the ValueList manipulation.
-    case bitc::CST_CODE_NULL:      // NULL
+      continue;               // Skip the ValueList manipulation.
+    case bitc::CST_CODE_NULL: // NULL
       V = Constant::getNullValue(CurTy);
       break;
-    case bitc::CST_CODE_INTEGER:   // INTEGER: [intval]
+    case bitc::CST_CODE_INTEGER: // INTEGER: [intval]
       if (!CurTy->isIntegerTy() || Record.empty())
         return error("Invalid record");
       V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0]));
       break;
-    case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval]
+    case bitc::CST_CODE_WIDE_INTEGER: { // WIDE_INTEGER: [n x intval]
       if (!CurTy->isIntegerTy() || Record.empty())
         return error("Invalid record");
 
@@ -2400,7 +2512,7 @@ Error BitcodeReader::parseConstants() {
 
       break;
     }
-    case bitc::CST_CODE_FLOAT: {    // FLOAT: [fpval]
+    case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval]
       if (Record.empty())
         return error("Invalid record");
       if (CurTy->isHalfTy())
@@ -2410,8 +2522,8 @@ Error BitcodeReader::parseConstants() {
         V = ConstantFP::get(Context, APFloat(APFloat::IEEEsingle(),
                                              APInt(32, (uint32_t)Record[0])));
       else if (CurTy->isDoubleTy())
-        V = ConstantFP::get(Context, APFloat(APFloat::IEEEdouble(),
-                                             APInt(64, Record[0])));
+        V = ConstantFP::get(
+            Context, APFloat(APFloat::IEEEdouble(), APInt(64, Record[0])));
       else if (CurTy->isX86_FP80Ty()) {
         // Bits are not stored the same way as a normal i80 APInt, compensate.
         uint64_t Rearrange[2];
@@ -2420,27 +2532,27 @@ Error BitcodeReader::parseConstants() {
         V = ConstantFP::get(Context, APFloat(APFloat::x87DoubleExtended(),
                                              APInt(80, Rearrange)));
       } else if (CurTy->isFP128Ty())
-        V = ConstantFP::get(Context, APFloat(APFloat::IEEEquad(),
-                                             APInt(128, Record)));
+        V = ConstantFP::get(Context,
+                            APFloat(APFloat::IEEEquad(), APInt(128, Record)));
       else if (CurTy->isPPC_FP128Ty())
-        V = ConstantFP::get(Context, APFloat(APFloat::PPCDoubleDouble(),
-                                             APInt(128, Record)));
+        V = ConstantFP::get(
+            Context, APFloat(APFloat::PPCDoubleDouble(), APInt(128, Record)));
       else
         V = UndefValue::get(CurTy);
       break;
     }
 
-    case bitc::CST_CODE_AGGREGATE: {// AGGREGATE: [n x value number]
+    case bitc::CST_CODE_AGGREGATE: { // AGGREGATE: [n x value number]
       if (Record.empty())
         return error("Invalid record");
 
       unsigned Size = Record.size();
-      SmallVector<Constant*, 16> Elts;
+      SmallVector<Constant *, 16> Elts;
 
       if (StructType *STy = dyn_cast<StructType>(CurTy)) {
         for (unsigned i = 0; i != Size; ++i)
-          Elts.push_back(ValueList.getConstantFwdRef(Record[i],
-                                                     STy->getElementType(i)));
+          Elts.push_back(
+              ValueList.getConstantFwdRef(Record[i], STy->getElementType(i)));
         V = ConstantStruct::get(STy, Elts);
       } else if (ArrayType *ATy = dyn_cast<ArrayType>(CurTy)) {
         Type *EltTy = ATy->getElementType();
@@ -2467,7 +2579,7 @@ Error BitcodeReader::parseConstants() {
                                        BitCode == bitc::CST_CODE_CSTRING);
       break;
     }
-    case bitc::CST_CODE_DATA: {// DATA: [n x value]
+    case bitc::CST_CODE_DATA: { // DATA: [n x value]
       if (Record.empty())
         return error("Invalid record");
 
@@ -2519,12 +2631,12 @@ Error BitcodeReader::parseConstants() {
       }
       break;
     }
-    case bitc::CST_CODE_CE_UNOP: {  // CE_UNOP: [opcode, opval]
+    case bitc::CST_CODE_CE_UNOP: { // CE_UNOP: [opcode, opval]
       if (Record.size() < 2)
         return error("Invalid record");
       int Opc = getDecodedUnaryOpcode(Record[0], CurTy);
       if (Opc < 0) {
-        V = UndefValue::get(CurTy);  // Unknown unop.
+        V = UndefValue::get(CurTy); // Unknown unop.
       } else {
         Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy);
         unsigned Flags = 0;
@@ -2532,29 +2644,25 @@ Error BitcodeReader::parseConstants() {
       }
       break;
     }
-    case bitc::CST_CODE_CE_BINOP: {  // CE_BINOP: [opcode, opval, opval]
+    case bitc::CST_CODE_CE_BINOP: { // CE_BINOP: [opcode, opval, opval]
       if (Record.size() < 3)
         return error("Invalid record");
       int Opc = getDecodedBinaryOpcode(Record[0], CurTy);
       if (Opc < 0) {
-        V = UndefValue::get(CurTy);  // Unknown binop.
+        V = UndefValue::get(CurTy); // Unknown binop.
       } else {
         Constant *LHS = ValueList.getConstantFwdRef(Record[1], CurTy);
         Constant *RHS = ValueList.getConstantFwdRef(Record[2], CurTy);
         unsigned Flags = 0;
         if (Record.size() >= 4) {
-          if (Opc == Instruction::Add ||
-              Opc == Instruction::Sub ||
-              Opc == Instruction::Mul ||
-              Opc == Instruction::Shl) {
+          if (Opc == Instruction::Add || Opc == Instruction::Sub ||
+              Opc == Instruction::Mul || Opc == Instruction::Shl) {
             if (Record[3] & (1 << bitc::OBO_NO_SIGNED_WRAP))
               Flags |= OverflowingBinaryOperator::NoSignedWrap;
             if (Record[3] & (1 << bitc::OBO_NO_UNSIGNED_WRAP))
               Flags |= OverflowingBinaryOperator::NoUnsignedWrap;
-          } else if (Opc == Instruction::SDiv ||
-                     Opc == Instruction::UDiv ||
-                     Opc == Instruction::LShr ||
-                     Opc == Instruction::AShr) {
+          } else if (Opc == Instruction::SDiv || Opc == Instruction::UDiv ||
+                     Opc == Instruction::LShr || Opc == Instruction::AShr) {
             if (Record[3] & (1 << bitc::PEO_EXACT))
               Flags |= SDivOperator::IsExact;
           }
@@ -2563,24 +2671,25 @@ Error BitcodeReader::parseConstants() {
       }
       break;
     }
-    case bitc::CST_CODE_CE_CAST: {  // CE_CAST: [opcode, opty, opval]
+    case bitc::CST_CODE_CE_CAST: { // CE_CAST: [opcode, opty, opval]
       if (Record.size() < 3)
         return error("Invalid record");
       int Opc = getDecodedCastOpcode(Record[0]);
       if (Opc < 0) {
-        V = UndefValue::get(CurTy);  // Unknown cast.
+        V = UndefValue::get(CurTy); // Unknown cast.
       } else {
         Type *OpTy = getTypeByID(Record[1]);
         if (!OpTy)
           return error("Invalid record");
         Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy);
         V = UpgradeBitCastExpr(Opc, Op, CurTy);
-        if (!V) V = ConstantExpr::getCast(Opc, Op, CurTy);
+        if (!V)
+          V = ConstantExpr::getCast(Opc, Op, CurTy);
       }
       break;
     }
-    case bitc::CST_CODE_CE_INBOUNDS_GEP: // [ty, n x operands]
-    case bitc::CST_CODE_CE_GEP: // [ty, n x operands]
+    case bitc::CST_CODE_CE_INBOUNDS_GEP:             // [ty, n x operands]
+    case bitc::CST_CODE_CE_GEP:                      // [ty, n x operands]
     case bitc::CST_CODE_CE_GEP_WITH_INRANGE_INDEX: { // [ty, flags, n x
                                                      // operands]
       unsigned OpNum = 0;
@@ -2598,7 +2707,7 @@ Error BitcodeReader::parseConstants() {
       } else if (BitCode == bitc::CST_CODE_CE_INBOUNDS_GEP)
         InBounds = true;
 
-      SmallVector<Constant*, 16> Elts;
+      SmallVector<Constant *, 16> Elts;
       Type *Elt0FullTy = nullptr;
       while (OpNum != Record.size()) {
         if (!Elt0FullTy)
@@ -2625,7 +2734,7 @@ Error BitcodeReader::parseConstants() {
                                          InBounds, InRangeIndex);
       break;
     }
-    case bitc::CST_CODE_CE_SELECT: {  // CE_SELECT: [opval#, opval#, opval#]
+    case bitc::CST_CODE_CE_SELECT: { // CE_SELECT: [opval#, opval#, opval#]
       if (Record.size() < 3)
         return error("Invalid record");
 
@@ -2638,18 +2747,17 @@ Error BitcodeReader::parseConstants() {
           if (SelectorTy != V->getType())
             SelectorTy = VectorType::get(SelectorTy, VTy->getNumElements());
 
-      V = ConstantExpr::getSelect(ValueList.getConstantFwdRef(Record[0],
-                                                              SelectorTy),
-                                  ValueList.getConstantFwdRef(Record[1],CurTy),
-                                  ValueList.getConstantFwdRef(Record[2],CurTy));
+      V = ConstantExpr::getSelect(
+          ValueList.getConstantFwdRef(Record[0], SelectorTy),
+          ValueList.getConstantFwdRef(Record[1], CurTy),
+          ValueList.getConstantFwdRef(Record[2], CurTy));
       break;
     }
-    case bitc::CST_CODE_CE_EXTRACTELT
-        : { // CE_EXTRACTELT: [opty, opval, opty, opval]
+    case bitc::CST_CODE_CE_EXTRACTELT: { // CE_EXTRACTELT: [opty, opval, opty,
+                                         // opval]
       if (Record.size() < 3)
         return error("Invalid record");
-      VectorType *OpTy =
-        dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
+      VectorType *OpTy = dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
       if (!OpTy)
         return error("Invalid record");
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
@@ -2666,14 +2774,14 @@ Error BitcodeReader::parseConstants() {
       V = ConstantExpr::getExtractElement(Op0, Op1);
       break;
     }
-    case bitc::CST_CODE_CE_INSERTELT
-        : { // CE_INSERTELT: [opval, opval, opty, opval]
+    case bitc::CST_CODE_CE_INSERTELT: { // CE_INSERTELT: [opval, opval, opty,
+                                        // opval]
       VectorType *OpTy = dyn_cast<VectorType>(CurTy);
       if (Record.size() < 3 || !OpTy)
         return error("Invalid record");
       Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
-      Constant *Op1 = ValueList.getConstantFwdRef(Record[1],
-                                                  OpTy->getElementType());
+      Constant *Op1 =
+          ValueList.getConstantFwdRef(Record[1], OpTy->getElementType());
       Constant *Op2 = nullptr;
       if (Record.size() == 4) {
         Type *IdxTy = getTypeByID(Record[2]);
@@ -2693,27 +2801,26 @@ Error BitcodeReader::parseConstants() {
         return error("Invalid record");
       Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy);
-      Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
-                                                 OpTy->getNumElements());
+      Type *ShufTy =
+          VectorType::get(Type::getInt32Ty(Context), OpTy->getNumElements());
       Constant *Op2 = ValueList.getConstantFwdRef(Record[2], ShufTy);
       V = ConstantExpr::getShuffleVector(Op0, Op1, Op2);
       break;
     }
     case bitc::CST_CODE_CE_SHUFVEC_EX: { // [opty, opval, opval, opval]
       VectorType *RTy = dyn_cast<VectorType>(CurTy);
-      VectorType *OpTy =
-        dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
+      VectorType *OpTy = dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
       if (Record.size() < 4 || !RTy || !OpTy)
         return error("Invalid record");
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
-      Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
-                                                 RTy->getNumElements());
+      Type *ShufTy =
+          VectorType::get(Type::getInt32Ty(Context), RTy->getNumElements());
       Constant *Op2 = ValueList.getConstantFwdRef(Record[3], ShufTy);
       V = ConstantExpr::getShuffleVector(Op0, Op1, Op2);
       break;
     }
-    case bitc::CST_CODE_CE_CMP: {     // CE_CMP: [opty, opval, opval, pred]
+    case bitc::CST_CODE_CE_CMP: { // CE_CMP: [opty, opval, opval, pred]
       if (Record.size() < 4)
         return error("Invalid record");
       Type *OpTy = getTypeByID(Record[0]);
@@ -2737,16 +2844,16 @@ Error BitcodeReader::parseConstants() {
       bool HasSideEffects = Record[0] & 1;
       bool IsAlignStack = Record[0] >> 1;
       unsigned AsmStrSize = Record[1];
-      if (2+AsmStrSize >= Record.size())
+      if (2 + AsmStrSize >= Record.size())
         return error("Invalid record");
-      unsigned ConstStrSize = Record[2+AsmStrSize];
-      if (3+AsmStrSize+ConstStrSize > Record.size())
+      unsigned ConstStrSize = Record[2 + AsmStrSize];
+      if (3 + AsmStrSize + ConstStrSize > Record.size())
         return error("Invalid record");
 
       for (unsigned i = 0; i != AsmStrSize; ++i)
-        AsmStr += (char)Record[2+i];
+        AsmStr += (char)Record[2 + i];
       for (unsigned i = 0; i != ConstStrSize; ++i)
-        ConstrStr += (char)Record[3+AsmStrSize+i];
+        ConstrStr += (char)Record[3 + AsmStrSize + i];
       UpgradeInlineAsmString(&AsmStr);
       V = InlineAsm::get(
           cast<FunctionType>(getPointerElementFlatType(CurFullTy)), AsmStr,
@@ -2763,16 +2870,16 @@ Error BitcodeReader::parseConstants() {
       bool IsAlignStack = (Record[0] >> 1) & 1;
       unsigned AsmDialect = Record[0] >> 2;
       unsigned AsmStrSize = Record[1];
-      if (2+AsmStrSize >= Record.size())
+      if (2 + AsmStrSize >= Record.size())
         return error("Invalid record");
-      unsigned ConstStrSize = Record[2+AsmStrSize];
-      if (3+AsmStrSize+ConstStrSize > Record.size())
+      unsigned ConstStrSize = Record[2 + AsmStrSize];
+      if (3 + AsmStrSize + ConstStrSize > Record.size())
         return error("Invalid record");
 
       for (unsigned i = 0; i != AsmStrSize; ++i)
-        AsmStr += (char)Record[2+i];
+        AsmStr += (char)Record[2 + i];
       for (unsigned i = 0; i != ConstStrSize; ++i)
-        ConstrStr += (char)Record[3+AsmStrSize+i];
+        ConstrStr += (char)Record[3 + AsmStrSize + i];
       UpgradeInlineAsmString(&AsmStr);
       V = InlineAsm::get(
           cast<FunctionType>(getPointerElementFlatType(CurFullTy)), AsmStr,
@@ -2780,14 +2887,14 @@ Error BitcodeReader::parseConstants() {
           InlineAsm::AsmDialect(AsmDialect));
       break;
     }
-    case bitc::CST_CODE_BLOCKADDRESS:{
+    case bitc::CST_CODE_BLOCKADDRESS: {
       if (Record.size() < 3)
         return error("Invalid record");
       Type *FnTy = getTypeByID(Record[0]);
       if (!FnTy)
         return error("Invalid record");
-      Function *Fn =
-        dyn_cast_or_null<Function>(ValueList.getConstantFwdRef(Record[1],FnTy));
+      Function *Fn = dyn_cast_or_null<Function>(
+          ValueList.getConstantFwdRef(Record[1], FnTy));
       if (!Fn)
         return error("Invalid record");
 
@@ -2861,7 +2968,7 @@ Error BitcodeReader::parseUseLists() {
     if (!MaybeRecord)
       return MaybeRecord.takeError();
     switch (MaybeRecord.get()) {
-    default:  // Default behavior: unknown type.
+    default: // Default behavior: unknown type.
       break;
     case bitc::USELIST_CODE_BB:
       IsBB = true;
@@ -3011,7 +3118,8 @@ Error BitcodeReader::rememberAndSkipFunctionBodies() {
     return error("Could not find function in stream");
 
   if (!SeenFirstFunctionBody)
-    return error("Trying to materialize functions before seeing function blocks");
+    return error(
+        "Trying to materialize functions before seeing function blocks");
 
   // An old bitcode file with the symbol table at the end would have
   // finished the parse greedily.
@@ -3428,7 +3536,7 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
 
     case BitstreamEntry::SubBlock:
       switch (Entry.ID) {
-      default:  // Skip unknown content.
+      default: // Skip unknown content.
         if (Error Err = Stream.SkipBlock())
           return Err;
         break;
@@ -3565,7 +3673,8 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
     if (!MaybeBitCode)
       return MaybeBitCode.takeError();
     switch (unsigned BitCode = MaybeBitCode.get()) {
-    default: break;  // Default behavior, ignore unknown content.
+    default:
+      break; // Default behavior, ignore unknown content.
     case bitc::MODULE_CODE_VERSION: {
       Expected<unsigned> VersionOrErr = parseVersionRecord(Record);
       if (!VersionOrErr)
@@ -3573,28 +3682,28 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
       UseRelativeIDs = *VersionOrErr >= 1;
       break;
     }
-    case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
+    case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N]
       std::string S;
       if (convertToString(Record, 0, S))
         return error("Invalid record");
       TheModule->setTargetTriple(S);
       break;
     }
-    case bitc::MODULE_CODE_DATALAYOUT: {  // DATALAYOUT: [strchr x N]
+    case bitc::MODULE_CODE_DATALAYOUT: { // DATALAYOUT: [strchr x N]
       std::string S;
       if (convertToString(Record, 0, S))
         return error("Invalid record");
       TheModule->setDataLayout(S);
       break;
     }
-    case bitc::MODULE_CODE_ASM: {  // ASM: [strchr x N]
+    case bitc::MODULE_CODE_ASM: { // ASM: [strchr x N]
       std::string S;
       if (convertToString(Record, 0, S))
         return error("Invalid record");
       TheModule->setModuleInlineAsm(S);
       break;
     }
-    case bitc::MODULE_CODE_DEPLIB: {  // DEPLIB: [strchr x N]
+    case bitc::MODULE_CODE_DEPLIB: { // DEPLIB: [strchr x N]
       // FIXME: Remove in 4.0.
       std::string S;
       if (convertToString(Record, 0, S))
@@ -3602,14 +3711,14 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
       // Ignore value.
       break;
     }
-    case bitc::MODULE_CODE_SECTIONNAME: {  // SECTIONNAME: [strchr x N]
+    case bitc::MODULE_CODE_SECTIONNAME: { // SECTIONNAME: [strchr x N]
       std::string S;
       if (convertToString(Record, 0, S))
         return error("Invalid record");
       SectionTable.push_back(S);
       break;
     }
-    case bitc::MODULE_CODE_GCNAME: {  // SECTIONNAME: [strchr x N]
+    case bitc::MODULE_CODE_GCNAME: { // GCNAME: [strchr x N]
       std::string S;
       if (convertToString(Record, 0, S))
         return error("Invalid record");
@@ -3683,9 +3792,9 @@ void BitcodeReader::propagateByValTypes(CallBase *CB,
       continue;
 
     CB->removeParamAttr(i, Attribute::ByVal);
-    CB->addParamAttr(
-        i, Attribute::getWithByValType(
-               Context, getPointerElementFlatType(ArgsFullTys[i])));
+    CB->addParamAttr(i,
+                     Attribute::getWithByValType(
+                         Context, getPointerElementFlatType(ArgsFullTys[i])));
   }
 }
 
@@ -3743,7 +3852,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
 
     case BitstreamEntry::SubBlock:
       switch (Entry.ID) {
-      default:  // Skip unknown content.
+      default: // Skip unknown content.
         if (Error Err = Stream.SkipBlock())
           return Err;
         break;
@@ -3788,7 +3897,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
     switch (unsigned BitCode = MaybeBitCode.get()) {
     default: // Default behavior: reject
       return error("Invalid value");
-    case bitc::FUNC_CODE_DECLAREBLOCKS: {   // DECLAREBLOCKS: [nblocks]
+    case bitc::FUNC_CODE_DECLAREBLOCKS: { // DECLAREBLOCKS: [nblocks]
       if (Record.size() < 1 || Record[0] == 0)
         return error("Invalid record");
       // Create all the basic blocks for the function.
@@ -3823,7 +3932,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       continue;
     }
 
-    case bitc::FUNC_CODE_DEBUG_LOC_AGAIN:  // DEBUG_LOC_AGAIN
+    case bitc::FUNC_CODE_DEBUG_LOC_AGAIN: // DEBUG_LOC_AGAIN
       // This record indicates that the last instruction is at the same
       // location as the previous instruction with a location.
       I = getLastInstruction();
@@ -3834,7 +3943,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       I = nullptr;
       continue;
 
-    case bitc::FUNC_CODE_DEBUG_LOC: {      // DEBUG_LOC: [line, col, scope, ia]
+    case bitc::FUNC_CODE_DEBUG_LOC: { // DEBUG_LOC: [line, col, scope, ia]
       I = getLastInstruction();
       if (!I || Record.size() < 4)
         return error("Invalid record");
@@ -3861,11 +3970,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       I = nullptr;
       continue;
     }
-    case bitc::FUNC_CODE_INST_UNOP: {    // UNOP: [opval, ty, opcode]
+    case bitc::FUNC_CODE_INST_UNOP: { // UNOP: [opval, ty, opcode]
       unsigned OpNum = 0;
       Value *LHS;
       if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
-          OpNum+1 > Record.size())
+          OpNum + 1 > Record.size())
         return error("Invalid record");
 
       int Opc = getDecodedUnaryOpcode(Record[OpNum++], LHS->getType());
@@ -3882,12 +3991,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       }
       break;
     }
-    case bitc::FUNC_CODE_INST_BINOP: {    // BINOP: [opval, ty, opval, opcode]
+    case bitc::FUNC_CODE_INST_BINOP: { // BINOP: [opval, ty, opval, opcode]
       unsigned OpNum = 0;
       Value *LHS, *RHS;
       if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
           popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
-          OpNum+1 > Record.size())
+          OpNum + 1 > Record.size())
         return error("Invalid record");
 
       int Opc = getDecodedBinaryOpcode(Record[OpNum++], LHS->getType());
@@ -3896,18 +4005,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
       InstructionList.push_back(I);
       if (OpNum < Record.size()) {
-        if (Opc == Instruction::Add ||
-            Opc == Instruction::Sub ||
-            Opc == Instruction::Mul ||
-            Opc == Instruction::Shl) {
+        if (Opc == Instruction::Add || Opc == Instruction::Sub ||
+            Opc == Instruction::Mul || Opc == Instruction::Shl) {
           if (Record[OpNum] & (1 << bitc::OBO_NO_SIGNED_WRAP))
             cast<BinaryOperator>(I)->setHasNoSignedWrap(true);
           if (Record[OpNum] & (1 << bitc::OBO_NO_UNSIGNED_WRAP))
             cast<BinaryOperator>(I)->setHasNoUnsignedWrap(true);
-        } else if (Opc == Instruction::SDiv ||
-                   Opc == Instruction::UDiv ||
-                   Opc == Instruction::LShr ||
-                   Opc == Instruction::AShr) {
+        } else if (Opc == Instruction::SDiv || Opc == Instruction::UDiv ||
+                   Opc == Instruction::LShr || Opc == Instruction::AShr) {
           if (Record[OpNum] & (1 << bitc::PEO_EXACT))
             cast<BinaryOperator>(I)->setIsExact(true);
         } else if (isa<FPMathOperator>(I)) {
@@ -3915,15 +4020,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
           if (FMF.any())
             I->setFastMathFlags(FMF);
         }
-
       }
       break;
     }
-    case bitc::FUNC_CODE_INST_CAST: {    // CAST: [opval, opty, destty, castopc]
+    case bitc::FUNC_CODE_INST_CAST: { // CAST: [opval, opty, destty, castopc]
       unsigned OpNum = 0;
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
-          OpNum+2 != Record.size())
+          OpNum + 2 != Record.size())
         return error("Invalid record");
 
       FullTy = getFullyStructuredTypeByID(Record[OpNum]);
@@ -3975,7 +4079,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
         return error(
             "Explicit gep type does not match pointee type of pointer operand");
 
-      SmallVector<Value*, 16> GEPIdx;
+      SmallVector<Value *, 16> GEPIdx;
       while (OpNum != Record.size()) {
         Value *Op;
         if (getValueTypePair(Record, OpNum, NextValueNo, Op))
@@ -3993,7 +4097,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
     }
 
     case bitc::FUNC_CODE_INST_EXTRACTVAL: {
-                                       // EXTRACTVAL: [opty, opval, n x indices]
+      // EXTRACTVAL: [opty, opval, n x indices]
       unsigned OpNum = 0;
       Value *Agg;
       if (getValueTypePair(Record, OpNum, NextValueNo, Agg, &FullTy))
@@ -4031,7 +4135,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
     }
 
     case bitc::FUNC_CODE_INST_INSERTVAL: {
-                           // INSERTVAL: [opty, opval, opty, opval, n x indices]
+      // INSERTVAL: [opty, opval, opty, opval, n x indices]
       unsigned OpNum = 0;
       Value *Agg;
       if (getValueTypePair(Record, OpNum, NextValueNo, Agg, &FullTy))
@@ -4090,7 +4194,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       break;
     }
 
-    case bitc::FUNC_CODE_INST_VSELECT: {// VSELECT: [ty,opval,opval,predty,pred]
+    case bitc::FUNC_CODE_INST_VSELECT: { // VSELECT:
+                                         // [ty,opval,opval,predty,pred]
       // new form of select
       // handles select i1 or select [N x i1]
       unsigned OpNum = 0;
@@ -4101,8 +4206,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
         return error("Invalid record");
 
       // select condition can be either i1 or [N x i1]
-      if (VectorType* vector_type =
-          dyn_cast<VectorType>(Cond->getType())) {
+      if (VectorType *vector_type = dyn_cast<VectorType>(Cond->getType())) {
         // expect <n x i1>
         if (vector_type->getElementType() != Type::getInt1Ty(Context))
           return error("Invalid type for value");
@@ -4152,7 +4256,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       break;
     }
 
-    case bitc::FUNC_CODE_INST_SHUFFLEVEC: {// SHUFFLEVEC: [opval,ty,opval,opval]
+    case bitc::FUNC_CODE_INST_SHUFFLEVEC: { // SHUFFLEVEC:
+                                            // [opval,ty,opval,opval]
       unsigned OpNum = 0;
       Value *Vec1, *Vec2, *Mask;
       if (getValueTypePair(Record, OpNum, NextValueNo, Vec1, &FullTy) ||
@@ -4170,10 +4275,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       break;
     }
 
-    case bitc::FUNC_CODE_INST_CMP:   // CMP: [opty, opval, opval, pred]
-      // Old form of ICmp/FCmp returning bool
-      // Existed to differentiate between icmp/fcmp and vicmp/vfcmp which were
-      // both legal on vectors but had different behaviour.
+    case bitc::FUNC_CODE_INST_CMP: // CMP: [opty, opval, opval, pred]
+                                   // Old form of ICmp/FCmp returning bool
+                                   // Existed to differentiate between icmp/fcmp
+                                   // and vicmp/vfcmp which were both legal on
+                                   // vectors but had different behaviour.
     case bitc::FUNC_CODE_INST_CMP2: { // CMP2: [opty, opval, opval, pred]
       // FCmp/ICmp returning bool or vector of bool
 
@@ -4190,10 +4296,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       unsigned PredVal = Record[OpNum];
       bool IsFP = LHS->getType()->isFPOrFPVectorTy();
       FastMathFlags FMF;
-      if (IsFP && Record.size() > OpNum+1)
+      if (IsFP && Record.size() > OpNum + 1)
         FMF = getDecodedFastMathFlags(Record[++OpNum]);
 
-      if (OpNum+1 != Record.size())
+      if (OpNum + 1 != Record.size())
         return error("Invalid record");
 
       if (LHS->getType()->isFPOrFPVectorTy())
@@ -4208,25 +4314,25 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
     }
 
     case bitc::FUNC_CODE_INST_RET: // RET: [opty,opval<optional>]
-      {
-        unsigned Size = Record.size();
-        if (Size == 0) {
-          I = ReturnInst::Create(Context);
-          InstructionList.push_back(I);
-          break;
-        }
-
-        unsigned OpNum = 0;
-        Value *Op = nullptr;
-        if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-          return error("Invalid record");
-        if (OpNum != Record.size())
-          return error("Invalid record");
-
-        I = ReturnInst::Create(Context, Op);
+    {
+      unsigned Size = Record.size();
+      if (Size == 0) {
+        I = ReturnInst::Create(Context);
         InstructionList.push_back(I);
         break;
       }
+
+      unsigned OpNum = 0;
+      Value *Op = nullptr;
+      if (getValueTypePair(Record, OpNum, NextValueNo, Op))
+        return error("Invalid record");
+      if (OpNum != Record.size())
+        return error("Invalid record");
+
+      I = ReturnInst::Create(Context, Op);
+      InstructionList.push_back(I);
+      break;
+    }
     case bitc::FUNC_CODE_INST_BR: { // BR: [bb#, bb#, opval] or [bb#]
       if (Record.size() != 1 && Record.size() != 3)
         return error("Invalid record");
@@ -4237,11 +4343,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       if (Record.size() == 1) {
         I = BranchInst::Create(TrueDest);
         InstructionList.push_back(I);
-      }
-      else {
+      } else {
         BasicBlock *FalseDest = getBasicBlock(Record[1]);
-        Value *Cond = getValue(Record, 2, NextValueNo,
-                               Type::getInt1Ty(Context));
+        Value *Cond =
+            getValue(Record, 2, NextValueNo, Type::getInt1Ty(Context));
         if (!FalseDest || !Cond)
           return error("Invalid record");
         I = BranchInst::Create(TrueDest, FalseDest, Cond);
@@ -4376,7 +4481,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
 
         unsigned CurIdx = 5;
         for (unsigned i = 0; i != NumCases; ++i) {
-          SmallVector<ConstantInt*, 1> CaseVals;
+          SmallVector<ConstantInt *, 1> CaseVals;
           unsigned NumItems = Record[CurIdx++];
           for (unsigned ci = 0; ci != NumItems; ++ci) {
             bool isSingleNumber = Record[CurIdx++];
@@ -4401,14 +4506,15 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
               // compared as signed or unsigned values. The partially
               // implemented changes that used this format in the past used
               // unsigned comparisons.
-              for ( ; Low.ule(High); ++Low)
+              for (; Low.ule(High); ++Low)
                 CaseVals.push_back(ConstantInt::get(Context, Low));
             } else
               CaseVals.push_back(ConstantInt::get(Context, Low));
           }
           BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]);
-          for (SmallVector<ConstantInt*, 1>::iterator cvi = CaseVals.begin(),
-                 cve = CaseVals.end(); cvi != cve; ++cvi)
+          for (SmallVector<ConstantInt *, 1>::iterator cvi = CaseVals.begin(),
+                                                       cve = CaseVals.end();
+               cvi != cve; ++cvi)
             SI->addCase(*cvi, DestBB);
         }
         I = SI;
@@ -4424,13 +4530,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       BasicBlock *Default = getBasicBlock(Record[2]);
       if (!OpTy || !Cond || !Default)
         return error("Invalid record");
-      unsigned NumCases = (Record.size()-3)/2;
+      unsigned NumCases = (Record.size() - 3) / 2;
       SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases);
       InstructionList.push_back(SI);
       for (unsigned i = 0, e = NumCases; i != e; ++i) {
-        ConstantInt *CaseVal =
-          dyn_cast_or_null<ConstantInt>(getFnValueByID(Record[3+i*2], OpTy));
-        BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]);
+        ConstantInt *CaseVal = dyn_cast_or_null<ConstantInt>(
+            getFnValueByID(Record[3 + i * 2], OpTy));
+        BasicBlock *DestBB = getBasicBlock(Record[1 + 3 + i * 2]);
         if (!CaseVal || !DestBB) {
           delete SI;
           return error("Invalid record");
@@ -4447,11 +4553,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       Value *Address = getValue(Record, 1, NextValueNo, OpTy);
       if (!OpTy || !Address)
         return error("Invalid record");
-      unsigned NumDests = Record.size()-2;
+      unsigned NumDests = Record.size() - 2;
       IndirectBrInst *IBI = IndirectBrInst::Create(Address, NumDests);
       InstructionList.push_back(IBI);
       for (unsigned i = 0, e = NumDests; i != e; ++i) {
-        if (BasicBlock *DestBB = getBasicBlock(Record[2+i])) {
+        if (BasicBlock *DestBB = getBasicBlock(Record[2 + i])) {
           IBI->addDestination(DestBB);
         } else {
           delete IBI;
@@ -4501,11 +4607,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       if (Record.size() < FTy->getNumParams() + OpNum)
         return error("Insufficient operands to call");
 
-      SmallVector<Value*, 16> Ops;
+      SmallVector<Value *, 16> Ops;
       SmallVector<Type *, 16> ArgsFullTys;
       for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
-        Ops.push_back(getValue(Record, OpNum, NextValueNo,
-                               FTy->getParamType(i)));
+        Ops.push_back(
+            getValue(Record, OpNum, NextValueNo, FTy->getParamType(i)));
         ArgsFullTys.push_back(FullFTy->getParamType(i));
         if (!Ops.back())
           return error("Invalid record");
@@ -4588,14 +4694,14 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       if (Record.size() < FTy->getNumParams() + OpNum)
         return error("Insufficient operands to call");
 
-      SmallVector<Value*, 16> Args;
+      SmallVector<Value *, 16> Args;
       // Read the fixed params.
       for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
         if (FTy->getParamType(i)->isLabelTy())
           Args.push_back(getBasicBlock(Record[OpNum]));
         else
-          Args.push_back(getValue(Record, OpNum, NextValueNo,
-                                  FTy->getParamType(i)));
+          Args.push_back(
+              getValue(Record, OpNum, NextValueNo, FTy->getParamType(i)));
         if (!Args.back())
           return error("Invalid record");
       }
@@ -4628,26 +4734,26 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       InstructionList.push_back(I);
       break;
     case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...]
-      if (Record.size() < 1 || ((Record.size()-1)&1))
+      if (Record.size() < 1 || ((Record.size() - 1) & 1))
         return error("Invalid record");
       FullTy = getFullyStructuredTypeByID(Record[0]);
       Type *Ty = flattenPointerTypes(FullTy);
       if (!Ty)
         return error("Invalid record");
 
-      PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2);
+      PHINode *PN = PHINode::Create(Ty, (Record.size() - 1) / 2);
       InstructionList.push_back(PN);
 
-      for (unsigned i = 0, e = Record.size()-1; i != e; i += 2) {
+      for (unsigned i = 0, e = Record.size() - 1; i != e; i += 2) {
         Value *V;
         // With the new function encoding, it is possible that operands have
         // negative IDs (for forward references).  Use a signed VBR
         // representation to keep the encoding small.
         if (UseRelativeIDs)
-          V = getValueSigned(Record, 1+i, NextValueNo, Ty);
+          V = getValueSigned(Record, 1 + i, NextValueNo, Ty);
         else
-          V = getValue(Record, 1+i, NextValueNo, Ty);
-        BasicBlock *BB = getBasicBlock(Record[2+i]);
+          V = getValue(Record, 1 + i, NextValueNo, Ty);
+        BasicBlock *BB = getBasicBlock(Record[2 + i]);
         if (!V || !BB)
           return error("Invalid record");
         PN->addIncoming(V, BB);
@@ -4689,7 +4795,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       LP->setCleanup(IsCleanup);
       for (unsigned J = 0; J != NumClauses; ++J) {
         LandingPadInst::ClauseType CT =
-          LandingPadInst::ClauseType(Record[Idx++]); (void)CT;
+            LandingPadInst::ClauseType(Record[Idx++]);
+        (void)CT;
         Value *Val;
 
         if (getValueTypePair(Record, Idx, NextValueNo, Val)) {
@@ -4697,12 +4804,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
           return error("Invalid record");
         }
 
-        assert((CT != LandingPadInst::Catch ||
-                !isa<ArrayType>(Val->getType())) &&
-               "Catch clause has a invalid type!");
-        assert((CT != LandingPadInst::Filter ||
-                isa<ArrayType>(Val->getType())) &&
-               "Filter clause has invalid type!");
+        assert(
+            (CT != LandingPadInst::Catch || !isa<ArrayType>(Val->getType())) &&
+            "Catch clause has a invalid type!");
+        assert(
+            (CT != LandingPadInst::Filter || isa<ArrayType>(Val->getType())) &&
+            "Filter clause has invalid type!");
         LP->addClause(cast<Constant>(Val));
       }
 
@@ -4718,8 +4825,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       const uint64_t InAllocaMask = uint64_t(1) << 5;
       const uint64_t ExplicitTypeMask = uint64_t(1) << 6;
       const uint64_t SwiftErrorMask = uint64_t(1) << 7;
-      const uint64_t FlagMask = InAllocaMask | ExplicitTypeMask |
-                                SwiftErrorMask;
+      const uint64_t FlagMask =
+          InAllocaMask | ExplicitTypeMask | SwiftErrorMask;
       bool InAlloca = AlignRecord & InAllocaMask;
       bool SwiftError = AlignRecord & SwiftErrorMask;
       FullTy = getFullyStructuredTypeByID(Record[0]);
@@ -4779,7 +4886,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       break;
     }
     case bitc::FUNC_CODE_INST_LOADATOMIC: {
-       // LOADATOMIC: [opty, op, align, vol, ordering, ssid]
+      // LOADATOMIC: [opty, op, align, vol, ordering, ssid]
       unsigned OpNum = 0;
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op, &FullTy) ||
@@ -4816,7 +4923,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       break;
     }
     case bitc::FUNC_CODE_INST_STORE:
-    case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align, vol]
+    case bitc::FUNC_CODE_INST_STORE_OLD: { // STORE2:[ptrty, ptr, val, align,
+                                           // vol]
       unsigned OpNum = 0;
       Value *Val, *Ptr;
       Type *FullTy;
@@ -4833,7 +4941,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       unsigned Align;
       if (Error Err = parseAlignmentValue(Record[OpNum], Align))
         return Err;
-      I = new StoreInst(Val, Ptr, Record[OpNum+1], Align);
+      I = new StoreInst(Val, Ptr, Record[OpNum + 1], Align);
       InstructionList.push_back(I);
       break;
     }
@@ -4866,7 +4974,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       unsigned Align;
       if (Error Err = parseAlignmentValue(Record[OpNum], Align))
         return Err;
-      I = new StoreInst(Val, Ptr, Record[OpNum+1], Align, Ordering, SSID);
+      I = new StoreInst(Val, Ptr, Record[OpNum + 1], Align, Ordering, SSID);
       InstructionList.push_back(I);
       break;
     }
@@ -4923,7 +5031,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
         I = ExtractValueInst::Create(I, 0);
         FullTy = cast<StructType>(FullTy)->getElementType(0);
       } else {
-        cast<AtomicCmpXchgInst>(I)->setWeak(Record[OpNum+4]);
+        cast<AtomicCmpXchgInst>(I)->setWeak(Record[OpNum + 4]);
       }
 
       InstructionList.push_back(I);
@@ -4950,7 +5058,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 3]);
       I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SSID);
       FullTy = getPointerElementFlatType(FullTy);
-      cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]);
+      cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum + 1]);
       InstructionList.push_back(I);
       break;
     }
@@ -5012,15 +5120,15 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       if (Record.size() < FTy->getNumParams() + OpNum)
         return error("Insufficient operands to call");
 
-      SmallVector<Value*, 16> Args;
-      SmallVector<Type*, 16> ArgsFullTys;
+      SmallVector<Value *, 16> Args;
+      SmallVector<Type *, 16> ArgsFullTys;
       // Read the fixed params.
       for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
         if (FTy->getParamType(i)->isLabelTy())
           Args.push_back(getBasicBlock(Record[OpNum]));
         else
-          Args.push_back(getValue(Record, OpNum, NextValueNo,
-                                  FTy->getParamType(i)));
+          Args.push_back(
+              getValue(Record, OpNum, NextValueNo, FTy->getParamType(i)));
         ArgsFullTys.push_back(FullFTy->getParamType(i));
         if (!Args.back())
           return error("Invalid record");
@@ -5149,7 +5257,8 @@ OutOfRecordLoop:
   if (Argument *A = dyn_cast<Argument>(ValueList.back())) {
     if (!A->getParent()) {
       // We found at least one unresolved value.  Nuke them all to avoid leaks.
-      for (unsigned i = ModuleValueListSize, e = ValueList.size(); i != e; ++i){
+      for (unsigned i = ModuleValueListSize, e = ValueList.size(); i != e;
+           ++i) {
         if ((A = dyn_cast_or_null<Argument>(ValueList[i])) && !A->getParent()) {
           A->replaceAllUsesWith(UndefValue::get(A->getType()));
           delete A;
@@ -5166,7 +5275,7 @@ OutOfRecordLoop:
   // Trim the value list down to the size it was before we parsed this function.
   ValueList.shrinkTo(ModuleValueListSize);
   MDLoader->shrinkTo(ModuleMDLoaderSize);
-  std::vector<BasicBlock*>().swap(FunctionBBs);
+  std::vector<BasicBlock *>().swap(FunctionBBs);
   return Error::success();
 }
 
@@ -5206,7 +5315,7 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
   if (!F || !F->isMaterializable())
     return Error::success();
 
-  DenseMap<Function*, uint64_t>::iterator DFII = DeferredFunctionInfo.find(F);
+  DenseMap<Function *, uint64_t>::iterator DFII = DeferredFunctionInfo.find(F);
   assert(DFII != DeferredFunctionInfo.end() && "Deferred function not found!");
   // If its position is recorded as 0, its body is somewhere in the stream
   // but we haven't seen it yet.
@@ -5367,8 +5476,7 @@ void ModuleSummaryIndexBitcodeReader::setValueGUID(
   // the index so that the value name can be recorded.
   ValueIdToValueInfoMap[ValueID] = std::make_pair(
       TheIndex.getOrInsertValueInfo(
-          ValueGUID,
-          UseStrtab ? ValueName : TheIndex.saveString(ValueName)),
+          ValueGUID, UseStrtab ? ValueName : TheIndex.saveString(ValueName)),
       OriginalNameID);
 }
 
@@ -5537,71 +5645,71 @@ Error ModuleSummaryIndexBitcodeReader::parseModule() {
       continue;
 
     case BitstreamEntry::Record: {
-        Record.clear();
-        Expected<unsigned> MaybeBitCode = Stream.readRecord(Entry.ID, Record);
-        if (!MaybeBitCode)
-          return MaybeBitCode.takeError();
-        switch (MaybeBitCode.get()) {
-        default:
-          break; // Default behavior, ignore unknown content.
-        case bitc::MODULE_CODE_VERSION: {
-          if (Error Err = parseVersionRecord(Record).takeError())
-            return Err;
-          break;
-        }
-        /// MODULE_CODE_SOURCE_FILENAME: [namechar x N]
-        case bitc::MODULE_CODE_SOURCE_FILENAME: {
-          SmallString<128> ValueName;
-          if (convertToString(Record, 0, ValueName))
-            return error("Invalid record");
-          SourceFileName = ValueName.c_str();
-          break;
+      Record.clear();
+      Expected<unsigned> MaybeBitCode = Stream.readRecord(Entry.ID, Record);
+      if (!MaybeBitCode)
+        return MaybeBitCode.takeError();
+      switch (MaybeBitCode.get()) {
+      default:
+        break; // Default behavior, ignore unknown content.
+      case bitc::MODULE_CODE_VERSION: {
+        if (Error Err = parseVersionRecord(Record).takeError())
+          return Err;
+        break;
+      }
+      /// MODULE_CODE_SOURCE_FILENAME: [namechar x N]
+      case bitc::MODULE_CODE_SOURCE_FILENAME: {
+        SmallString<128> ValueName;
+        if (convertToString(Record, 0, ValueName))
+          return error("Invalid record");
+        SourceFileName = ValueName.c_str();
+        break;
+      }
+      /// MODULE_CODE_HASH: [5*i32]
+      case bitc::MODULE_CODE_HASH: {
+        if (Record.size() != 5)
+          return error("Invalid hash length " + Twine(Record.size()).str());
+        auto &Hash = getThisModule()->second.second;
+        int Pos = 0;
+        for (auto &Val : Record) {
+          assert(!(Val >> 32) && "Unexpected high bits set");
+          Hash[Pos++] = Val;
         }
-        /// MODULE_CODE_HASH: [5*i32]
-        case bitc::MODULE_CODE_HASH: {
-          if (Record.size() != 5)
-            return error("Invalid hash length " + Twine(Record.size()).str());
-          auto &Hash = getThisModule()->second.second;
-          int Pos = 0;
-          for (auto &Val : Record) {
-            assert(!(Val >> 32) && "Unexpected high bits set");
-            Hash[Pos++] = Val;
-          }
+        break;
+      }
+      /// MODULE_CODE_VSTOFFSET: [offset]
+      case bitc::MODULE_CODE_VSTOFFSET:
+        if (Record.size() < 1)
+          return error("Invalid record");
+        // Note that we subtract 1 here because the offset is relative to one
+        // word before the start of the identification or module block, which
+        // was historically always the start of the regular bitcode header.
+        VSTOffset = Record[0] - 1;
+        break;
+      // v1 GLOBALVAR: [pointer type, isconst,     initid,       linkage, ...]
+      // v1 FUNCTION:  [type,         callingconv, isproto,      linkage, ...]
+      // v1 ALIAS:     [alias type,   addrspace,   aliasee val#, linkage, ...]
+      // v2: [strtab offset, strtab size, v1]
+      case bitc::MODULE_CODE_GLOBALVAR:
+      case bitc::MODULE_CODE_FUNCTION:
+      case bitc::MODULE_CODE_ALIAS: {
+        StringRef Name;
+        ArrayRef<uint64_t> GVRecord;
+        std::tie(Name, GVRecord) = readNameFromStrtab(Record);
+        if (GVRecord.size() <= 3)
+          return error("Invalid record");
+        uint64_t RawLinkage = GVRecord[3];
+        GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
+        if (!UseStrtab) {
+          ValueIdToLinkageMap[ValueId++] = Linkage;
           break;
         }
-        /// MODULE_CODE_VSTOFFSET: [offset]
-        case bitc::MODULE_CODE_VSTOFFSET:
-          if (Record.size() < 1)
-            return error("Invalid record");
-          // Note that we subtract 1 here because the offset is relative to one
-          // word before the start of the identification or module block, which
-          // was historically always the start of the regular bitcode header.
-          VSTOffset = Record[0] - 1;
-          break;
-        // v1 GLOBALVAR: [pointer type, isconst,     initid,       linkage, ...]
-        // v1 FUNCTION:  [type,         callingconv, isproto,      linkage, ...]
-        // v1 ALIAS:     [alias type,   addrspace,   aliasee val#, linkage, ...]
-        // v2: [strtab offset, strtab size, v1]
-        case bitc::MODULE_CODE_GLOBALVAR:
-        case bitc::MODULE_CODE_FUNCTION:
-        case bitc::MODULE_CODE_ALIAS: {
-          StringRef Name;
-          ArrayRef<uint64_t> GVRecord;
-          std::tie(Name, GVRecord) = readNameFromStrtab(Record);
-          if (GVRecord.size() <= 3)
-            return error("Invalid record");
-          uint64_t RawLinkage = GVRecord[3];
-          GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
-          if (!UseStrtab) {
-            ValueIdToLinkageMap[ValueId++] = Linkage;
-            break;
-          }
 
-          setValueGUID(ValueId++, Name, Linkage, SourceFileName);
-          break;
-        }
-        }
+        setValueGUID(ValueId++, Name, Linkage, SourceFileName);
+        break;
+      }
       }
+    }
       continue;
     }
   }
@@ -5796,7 +5904,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
     switch (unsigned BitCode = MaybeBitCode.get()) {
     default: // Default behavior: ignore.
       break;
-    case bitc::FS_FLAGS: {  // [flags]
+    case bitc::FS_FLAGS: { // [flags]
       uint64_t Flags = Record[0];
       // Scan flags.
       assert(Flags <= 0x1f && "Unexpected bits in flag");
@@ -5915,7 +6023,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       AS->setModulePath(getThisModule()->first());
 
       auto AliaseeVI = getValueInfoFromValueId(AliaseeID).first;
-      auto AliaseeInModule = TheIndex.findSummaryInModule(AliaseeVI, ModulePath);
+      auto AliaseeInModule =
+          TheIndex.findSummaryInModule(AliaseeVI, ModulePath);
       if (!AliaseeInModule)
         return error("Alias expects aliasee summary to be parsed");
       AS->setAliasee(AliaseeVI, AliaseeInModule);
@@ -6057,7 +6166,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
       AS->setModulePath(ModuleIdMap[ModuleId]);
 
       auto AliaseeVI = getValueInfoFromValueId(AliaseeValueId).first;
-      auto AliaseeInModule = TheIndex.findSummaryInModule(AliaseeVI, AS->modulePath());
+      auto AliaseeInModule =
+          TheIndex.findSummaryInModule(AliaseeVI, AS->modulePath());
       AS->setAliasee(AliaseeVI, AliaseeInModule);
 
       ValueInfo VI = getValueInfoFromValueId(ValueID).first;
@@ -6110,13 +6220,13 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
     case bitc::FS_TYPE_TEST_ASSUME_VCALLS:
       assert(PendingTypeTestAssumeVCalls.empty());
       for (unsigned I = 0; I != Record.size(); I += 2)
-        PendingTypeTestAssumeVCalls.push_back({Record[I], Record[I+1]});
+        PendingTypeTestAssumeVCalls.push_back({Record[I], Record[I + 1]});
       break;
 
     case bitc::FS_TYPE_CHECKED_LOAD_VCALLS:
       assert(PendingTypeCheckedLoadVCalls.empty());
       for (unsigned I = 0; I != Record.size(); I += 2)
-        PendingTypeCheckedLoadVCalls.push_back({Record[I], Record[I+1]});
+        PendingTypeCheckedLoadVCalls.push_back({Record[I], Record[I + 1]});
       break;
 
     case bitc::FS_TYPE_TEST_ASSUME_CONST_VCALL:
@@ -6231,9 +6341,7 @@ namespace {
 // will be removed once this transition is complete. Clients should prefer to
 // deal with the Error value directly, rather than converting to error_code.
 class BitcodeErrorCategoryType : public std::error_category {
-  const char *name() const noexcept override {
-    return "llvm.bitcode";
-  }
+  const char *name() const noexcept override { return "llvm.bitcode"; }
 
   std::string message(int IE) const override {
     BitcodeError E = static_cast<BitcodeError>(IE);
diff --git a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp
index 5bd970432159c6bc7eee1cb2061e2891f07ddc7b..55e7415efbea2b37d85f20b1d123ce9a80efe67e 100644
--- a/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/hpvm/llvm_patches/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -24,9 +24,9 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Bitcode/LLVMBitCodes.h"
 #include "llvm/Bitstream/BitCodes.h"
 #include "llvm/Bitstream/BitstreamWriter.h"
-#include "llvm/Bitcode/LLVMBitCodes.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/BasicBlock.h"
@@ -339,8 +339,8 @@ private:
                              unsigned Abbrev);
   void writeDILocalVariable(const DILocalVariable *N,
                             SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
-  void writeDILabel(const DILabel *N,
-                    SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
+  void writeDILabel(const DILabel *N, SmallVectorImpl<uint64_t> &Record,
+                    unsigned Abbrev);
   void writeDIExpression(const DIExpression *N,
                          SmallVectorImpl<uint64_t> &Record, unsigned Abbrev);
   void writeDIGlobalVariableExpression(const DIGlobalVariableExpression *N,
@@ -391,9 +391,7 @@ private:
   void writeBlockInfo();
   void writeModuleHash(size_t BlockStartPos);
 
-  unsigned getEncodedSyncScopeID(SyncScope::ID SSID) {
-    return unsigned(SSID);
-  }
+  unsigned getEncodedSyncScopeID(SyncScope::ID SSID) { return unsigned(SSID); }
 };
 
 /// Class to manage the bitcode writing for a combined index.
@@ -426,9 +424,8 @@ public:
     // in writing out the call graph edges. Save the mapping from GUID
     // to the new global value id to use when writing those edges, which
     // are currently saved in the index in terms of GUID.
-    forEachSummary([&](GVInfo I, bool) {
-      GUIDToValueIdMap[I.first] = ++GlobalValueId;
-    });
+    forEachSummary(
+        [&](GVInfo I, bool) { GUIDToValueIdMap[I.first] = ++GlobalValueId; });
   }
 
   /// The below iterator returns the GUID and associated summary.
@@ -437,8 +434,7 @@ public:
   /// Calls the callback for each value GUID and summary to be written to
   /// bitcode. This hides the details of whether they are being pulled from the
   /// entire index or just those in a provided ModuleToSummariesForIndex map.
-  template<typename Functor>
-  void forEachSummary(Functor Callback) {
+  template <typename Functor> void forEachSummary(Functor Callback) {
     if (ModuleToSummariesForIndex) {
       for (auto &M : *ModuleToSummariesForIndex)
         for (auto &Summary : M.second) {
@@ -500,82 +496,133 @@ private:
 
 static unsigned getEncodedCastOpcode(unsigned Opcode) {
   switch (Opcode) {
-  default: llvm_unreachable("Unknown cast instruction!");
-  case Instruction::Trunc   : return bitc::CAST_TRUNC;
-  case Instruction::ZExt    : return bitc::CAST_ZEXT;
-  case Instruction::SExt    : return bitc::CAST_SEXT;
-  case Instruction::FPToUI  : return bitc::CAST_FPTOUI;
-  case Instruction::FPToSI  : return bitc::CAST_FPTOSI;
-  case Instruction::UIToFP  : return bitc::CAST_UITOFP;
-  case Instruction::SIToFP  : return bitc::CAST_SITOFP;
-  case Instruction::FPTrunc : return bitc::CAST_FPTRUNC;
-  case Instruction::FPExt   : return bitc::CAST_FPEXT;
-  case Instruction::PtrToInt: return bitc::CAST_PTRTOINT;
-  case Instruction::IntToPtr: return bitc::CAST_INTTOPTR;
-  case Instruction::BitCast : return bitc::CAST_BITCAST;
-  case Instruction::AddrSpaceCast: return bitc::CAST_ADDRSPACECAST;
+  default:
+    llvm_unreachable("Unknown cast instruction!");
+  case Instruction::Trunc:
+    return bitc::CAST_TRUNC;
+  case Instruction::ZExt:
+    return bitc::CAST_ZEXT;
+  case Instruction::SExt:
+    return bitc::CAST_SEXT;
+  case Instruction::FPToUI:
+    return bitc::CAST_FPTOUI;
+  case Instruction::FPToSI:
+    return bitc::CAST_FPTOSI;
+  case Instruction::UIToFP:
+    return bitc::CAST_UITOFP;
+  case Instruction::SIToFP:
+    return bitc::CAST_SITOFP;
+  case Instruction::FPTrunc:
+    return bitc::CAST_FPTRUNC;
+  case Instruction::FPExt:
+    return bitc::CAST_FPEXT;
+  case Instruction::PtrToInt:
+    return bitc::CAST_PTRTOINT;
+  case Instruction::IntToPtr:
+    return bitc::CAST_INTTOPTR;
+  case Instruction::BitCast:
+    return bitc::CAST_BITCAST;
+  case Instruction::AddrSpaceCast:
+    return bitc::CAST_ADDRSPACECAST;
   }
 }
 
 static unsigned getEncodedUnaryOpcode(unsigned Opcode) {
   switch (Opcode) {
-  default: llvm_unreachable("Unknown binary instruction!");
-  case Instruction::FNeg: return bitc::UNOP_NEG;
+  default:
+    llvm_unreachable("Unknown binary instruction!");
+  case Instruction::FNeg:
+    return bitc::UNOP_NEG;
   }
 }
 
 static unsigned getEncodedBinaryOpcode(unsigned Opcode) {
   switch (Opcode) {
-  default: llvm_unreachable("Unknown binary instruction!");
+  default:
+    llvm_unreachable("Unknown binary instruction!");
   case Instruction::Add:
-  case Instruction::FAdd: return bitc::BINOP_ADD;
+  case Instruction::FAdd:
+    return bitc::BINOP_ADD;
   case Instruction::Sub:
-  case Instruction::FSub: return bitc::BINOP_SUB;
+  case Instruction::FSub:
+    return bitc::BINOP_SUB;
   case Instruction::Mul:
-  case Instruction::FMul: return bitc::BINOP_MUL;
-  case Instruction::UDiv: return bitc::BINOP_UDIV;
+  case Instruction::FMul:
+    return bitc::BINOP_MUL;
+  case Instruction::UDiv:
+    return bitc::BINOP_UDIV;
   case Instruction::FDiv:
-  case Instruction::SDiv: return bitc::BINOP_SDIV;
-  case Instruction::URem: return bitc::BINOP_UREM;
+  case Instruction::SDiv:
+    return bitc::BINOP_SDIV;
+  case Instruction::URem:
+    return bitc::BINOP_UREM;
   case Instruction::FRem:
-  case Instruction::SRem: return bitc::BINOP_SREM;
-  case Instruction::Shl:  return bitc::BINOP_SHL;
-  case Instruction::LShr: return bitc::BINOP_LSHR;
-  case Instruction::AShr: return bitc::BINOP_ASHR;
-  case Instruction::And:  return bitc::BINOP_AND;
-  case Instruction::Or:   return bitc::BINOP_OR;
-  case Instruction::Xor:  return bitc::BINOP_XOR;
+  case Instruction::SRem:
+    return bitc::BINOP_SREM;
+  case Instruction::Shl:
+    return bitc::BINOP_SHL;
+  case Instruction::LShr:
+    return bitc::BINOP_LSHR;
+  case Instruction::AShr:
+    return bitc::BINOP_ASHR;
+  case Instruction::And:
+    return bitc::BINOP_AND;
+  case Instruction::Or:
+    return bitc::BINOP_OR;
+  case Instruction::Xor:
+    return bitc::BINOP_XOR;
   }
 }
 
 static unsigned getEncodedRMWOperation(AtomicRMWInst::BinOp Op) {
   switch (Op) {
-  default: llvm_unreachable("Unknown RMW operation!");
-  case AtomicRMWInst::Xchg: return bitc::RMW_XCHG;
-  case AtomicRMWInst::Add: return bitc::RMW_ADD;
-  case AtomicRMWInst::Sub: return bitc::RMW_SUB;
-  case AtomicRMWInst::And: return bitc::RMW_AND;
-  case AtomicRMWInst::Nand: return bitc::RMW_NAND;
-  case AtomicRMWInst::Or: return bitc::RMW_OR;
-  case AtomicRMWInst::Xor: return bitc::RMW_XOR;
-  case AtomicRMWInst::Max: return bitc::RMW_MAX;
-  case AtomicRMWInst::Min: return bitc::RMW_MIN;
-  case AtomicRMWInst::UMax: return bitc::RMW_UMAX;
-  case AtomicRMWInst::UMin: return bitc::RMW_UMIN;
-  case AtomicRMWInst::FAdd: return bitc::RMW_FADD;
-  case AtomicRMWInst::FSub: return bitc::RMW_FSUB;
+  default:
+    llvm_unreachable("Unknown RMW operation!");
+  case AtomicRMWInst::Xchg:
+    return bitc::RMW_XCHG;
+  case AtomicRMWInst::Add:
+    return bitc::RMW_ADD;
+  case AtomicRMWInst::Sub:
+    return bitc::RMW_SUB;
+  case AtomicRMWInst::And:
+    return bitc::RMW_AND;
+  case AtomicRMWInst::Nand:
+    return bitc::RMW_NAND;
+  case AtomicRMWInst::Or:
+    return bitc::RMW_OR;
+  case AtomicRMWInst::Xor:
+    return bitc::RMW_XOR;
+  case AtomicRMWInst::Max:
+    return bitc::RMW_MAX;
+  case AtomicRMWInst::Min:
+    return bitc::RMW_MIN;
+  case AtomicRMWInst::UMax:
+    return bitc::RMW_UMAX;
+  case AtomicRMWInst::UMin:
+    return bitc::RMW_UMIN;
+  case AtomicRMWInst::FAdd:
+    return bitc::RMW_FADD;
+  case AtomicRMWInst::FSub:
+    return bitc::RMW_FSUB;
   }
 }
 
 static unsigned getEncodedOrdering(AtomicOrdering Ordering) {
   switch (Ordering) {
-  case AtomicOrdering::NotAtomic: return bitc::ORDERING_NOTATOMIC;
-  case AtomicOrdering::Unordered: return bitc::ORDERING_UNORDERED;
-  case AtomicOrdering::Monotonic: return bitc::ORDERING_MONOTONIC;
-  case AtomicOrdering::Acquire: return bitc::ORDERING_ACQUIRE;
-  case AtomicOrdering::Release: return bitc::ORDERING_RELEASE;
-  case AtomicOrdering::AcquireRelease: return bitc::ORDERING_ACQREL;
-  case AtomicOrdering::SequentiallyConsistent: return bitc::ORDERING_SEQCST;
+  case AtomicOrdering::NotAtomic:
+    return bitc::ORDERING_NOTATOMIC;
+  case AtomicOrdering::Unordered:
+    return bitc::ORDERING_UNORDERED;
+  case AtomicOrdering::Monotonic:
+    return bitc::ORDERING_MONOTONIC;
+  case AtomicOrdering::Acquire:
+    return bitc::ORDERING_ACQUIRE;
+  case AtomicOrdering::Release:
+    return bitc::ORDERING_RELEASE;
+  case AtomicOrdering::AcquireRelease:
+    return bitc::ORDERING_ACQREL;
+  case AtomicOrdering::SequentiallyConsistent:
+    return bitc::ORDERING_SEQCST;
   }
   llvm_unreachable("Invalid ordering");
 }
@@ -746,7 +793,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
 void ModuleBitcodeWriter::writeAttributeGroupTable() {
   const std::vector<ValueEnumerator::IndexAndAttrSet> &AttrGrps =
       VE.getAttributeGroups();
-  if (AttrGrps.empty()) return;
+  if (AttrGrps.empty())
+    return;
 
   Stream.EnterSubblock(bitc::PARAMATTR_GROUP_BLOCK_ID, 3);
 
@@ -795,7 +843,8 @@ void ModuleBitcodeWriter::writeAttributeGroupTable() {
 
 void ModuleBitcodeWriter::writeAttributeTable() {
   const std::vector<AttributeList> &Attrs = VE.getAttributeLists();
-  if (Attrs.empty()) return;
+  if (Attrs.empty())
+    return;
 
   Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3);
 
@@ -828,13 +877,13 @@ void ModuleBitcodeWriter::writeTypeTable() {
   auto Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_POINTER));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
-  Abbv->Add(BitCodeAbbrevOp(0));  // Addrspace = 0
+  Abbv->Add(BitCodeAbbrevOp(0)); // Addrspace = 0
   unsigned PtrAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
   // Abbrev for TYPE_CODE_FUNCTION.
   Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_FUNCTION));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));  // isvararg
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // isvararg
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
   unsigned FunctionAbbrev = Stream.EmitAbbrev(std::move(Abbv));
@@ -842,7 +891,7 @@ void ModuleBitcodeWriter::writeTypeTable() {
   // Abbrev for TYPE_CODE_STRUCT_ANON.
   Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_ANON));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));  // ispacked
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
   unsigned StructAnonAbbrev = Stream.EmitAbbrev(std::move(Abbv));
@@ -857,7 +906,7 @@ void ModuleBitcodeWriter::writeTypeTable() {
   // Abbrev for TYPE_CODE_STRUCT_NAMED.
   Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_STRUCT_NAMED));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1));  // ispacked
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // ispacked
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
   unsigned StructNamedAbbrev = Stream.EmitAbbrev(std::move(Abbv));
@@ -865,7 +914,7 @@ void ModuleBitcodeWriter::writeTypeTable() {
   // Abbrev for TYPE_CODE_ARRAY.
   Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::TYPE_CODE_ARRAY));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // size
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // size
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, NumBits));
   unsigned ArrayAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
@@ -881,17 +930,39 @@ void ModuleBitcodeWriter::writeTypeTable() {
     unsigned Code = 0;
 
     switch (T->getTypeID()) {
-    case Type::VoidTyID:      Code = bitc::TYPE_CODE_VOID;      break;
-    case Type::HalfTyID:      Code = bitc::TYPE_CODE_HALF;      break;
-    case Type::FloatTyID:     Code = bitc::TYPE_CODE_FLOAT;     break;
-    case Type::DoubleTyID:    Code = bitc::TYPE_CODE_DOUBLE;    break;
-    case Type::X86_FP80TyID:  Code = bitc::TYPE_CODE_X86_FP80;  break;
-    case Type::FP128TyID:     Code = bitc::TYPE_CODE_FP128;     break;
-    case Type::PPC_FP128TyID: Code = bitc::TYPE_CODE_PPC_FP128; break;
-    case Type::LabelTyID:     Code = bitc::TYPE_CODE_LABEL;     break;
-    case Type::MetadataTyID:  Code = bitc::TYPE_CODE_METADATA;  break;
-    case Type::X86_MMXTyID:   Code = bitc::TYPE_CODE_X86_MMX;   break;
-    case Type::TokenTyID:     Code = bitc::TYPE_CODE_TOKEN;     break;
+    case Type::VoidTyID:
+      Code = bitc::TYPE_CODE_VOID;
+      break;
+    case Type::HalfTyID:
+      Code = bitc::TYPE_CODE_HALF;
+      break;
+    case Type::FloatTyID:
+      Code = bitc::TYPE_CODE_FLOAT;
+      break;
+    case Type::DoubleTyID:
+      Code = bitc::TYPE_CODE_DOUBLE;
+      break;
+    case Type::X86_FP80TyID:
+      Code = bitc::TYPE_CODE_X86_FP80;
+      break;
+    case Type::FP128TyID:
+      Code = bitc::TYPE_CODE_FP128;
+      break;
+    case Type::PPC_FP128TyID:
+      Code = bitc::TYPE_CODE_PPC_FP128;
+      break;
+    case Type::LabelTyID:
+      Code = bitc::TYPE_CODE_LABEL;
+      break;
+    case Type::MetadataTyID:
+      Code = bitc::TYPE_CODE_METADATA;
+      break;
+    case Type::X86_MMXTyID:
+      Code = bitc::TYPE_CODE_X86_MMX;
+      break;
+    case Type::TokenTyID:
+      Code = bitc::TYPE_CODE_TOKEN;
+      break;
     case Type::IntegerTyID:
       // INTEGER: [width]
       Code = bitc::TYPE_CODE_INTEGER;
@@ -904,7 +975,8 @@ void ModuleBitcodeWriter::writeTypeTable() {
       TypeVals.push_back(VE.getTypeID(PTy->getElementType()));
       unsigned AddressSpace = PTy->getAddressSpace();
       TypeVals.push_back(AddressSpace);
-      if (AddressSpace == 0) AbbrevToUse = PtrAbbrev;
+      if (AddressSpace == 0)
+        AbbrevToUse = PtrAbbrev;
       break;
     }
     case Type::FunctionTyID: {
@@ -924,7 +996,8 @@ void ModuleBitcodeWriter::writeTypeTable() {
       TypeVals.push_back(ST->isPacked());
       // Output all of the element types.
       for (StructType::element_iterator I = ST->element_begin(),
-           E = ST->element_end(); I != E; ++I)
+                                        E = ST->element_end();
+           I != E; ++I)
         TypeVals.push_back(VE.getTypeID(*I));
 
       if (ST->isLiteral()) {
@@ -1041,29 +1114,40 @@ static uint64_t getEncodedGVarFlags(GlobalVarSummary::GVarFlags Flags) {
 
 static unsigned getEncodedVisibility(const GlobalValue &GV) {
   switch (GV.getVisibility()) {
-  case GlobalValue::DefaultVisibility:   return 0;
-  case GlobalValue::HiddenVisibility:    return 1;
-  case GlobalValue::ProtectedVisibility: return 2;
+  case GlobalValue::DefaultVisibility:
+    return 0;
+  case GlobalValue::HiddenVisibility:
+    return 1;
+  case GlobalValue::ProtectedVisibility:
+    return 2;
   }
   llvm_unreachable("Invalid visibility");
 }
 
 static unsigned getEncodedDLLStorageClass(const GlobalValue &GV) {
   switch (GV.getDLLStorageClass()) {
-  case GlobalValue::DefaultStorageClass:   return 0;
-  case GlobalValue::DLLImportStorageClass: return 1;
-  case GlobalValue::DLLExportStorageClass: return 2;
+  case GlobalValue::DefaultStorageClass:
+    return 0;
+  case GlobalValue::DLLImportStorageClass:
+    return 1;
+  case GlobalValue::DLLExportStorageClass:
+    return 2;
   }
   llvm_unreachable("Invalid DLL storage class");
 }
 
 static unsigned getEncodedThreadLocalMode(const GlobalValue &GV) {
   switch (GV.getThreadLocalMode()) {
-    case GlobalVariable::NotThreadLocal:         return 0;
-    case GlobalVariable::GeneralDynamicTLSModel: return 1;
-    case GlobalVariable::LocalDynamicTLSModel:   return 2;
-    case GlobalVariable::InitialExecTLSModel:    return 3;
-    case GlobalVariable::LocalExecTLSModel:      return 4;
+  case GlobalVariable::NotThreadLocal:
+    return 0;
+  case GlobalVariable::GeneralDynamicTLSModel:
+    return 1;
+  case GlobalVariable::LocalDynamicTLSModel:
+    return 2;
+  case GlobalVariable::InitialExecTLSModel:
+    return 3;
+  case GlobalVariable::LocalExecTLSModel:
+    return 4;
   }
   llvm_unreachable("Invalid TLS model");
 }
@@ -1086,9 +1170,12 @@ static unsigned getEncodedComdatSelectionKind(const Comdat &C) {
 
 static unsigned getEncodedUnnamedAddr(const GlobalValue &GV) {
   switch (GV.getUnnamedAddr()) {
-  case GlobalValue::UnnamedAddr::None:   return 0;
-  case GlobalValue::UnnamedAddr::Local:  return 2;
-  case GlobalValue::UnnamedAddr::Global: return 1;
+  case GlobalValue::UnnamedAddr::None:
+    return 0;
+  case GlobalValue::UnnamedAddr::Local:
+    return 2;
+  case GlobalValue::UnnamedAddr::Global:
+    return 1;
   }
   llvm_unreachable("Invalid unnamed_addr");
 }
@@ -1182,8 +1269,8 @@ void ModuleBitcodeWriter::writeModuleInfo() {
       // Give section names unique ID's.
       unsigned &Entry = SectionMap[GV.getSection()];
       if (!Entry) {
-        writeStringRecord(Stream, bitc::MODULE_CODE_SECTIONNAME, GV.getSection(),
-                          0 /*TODO*/);
+        writeStringRecord(Stream, bitc::MODULE_CODE_SECTIONNAME,
+                          GV.getSection(), 0 /*TODO*/);
         Entry = SectionMap.size();
       }
     }
@@ -1219,7 +1306,7 @@ void ModuleBitcodeWriter::writeModuleInfo() {
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
-                              Log2_32_Ceil(MaxGlobalType+1)));
+                              Log2_32_Ceil(MaxGlobalType + 1)));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // AddrSpace << 2
                                                            //| explicitType << 1
                                                            //| constant
@@ -1228,15 +1315,15 @@ void ModuleBitcodeWriter::writeModuleInfo() {
     if (MaxAlignment == 0)                                 // Alignment.
       Abbv->Add(BitCodeAbbrevOp(0));
     else {
-      unsigned MaxEncAlignment = Log2_32(MaxAlignment)+1;
+      unsigned MaxEncAlignment = Log2_32(MaxAlignment) + 1;
       Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
-                               Log2_32_Ceil(MaxEncAlignment+1)));
+                                Log2_32_Ceil(MaxEncAlignment + 1)));
     }
-    if (SectionMap.empty())                                    // Section.
+    if (SectionMap.empty()) // Section.
       Abbv->Add(BitCodeAbbrevOp(0));
     else
       Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,
-                               Log2_32_Ceil(SectionMap.size()+1)));
+                                Log2_32_Ceil(SectionMap.size() + 1)));
     // Don't bother emitting vis + thread local.
     SimpleGVarAbbrev = Stream.EmitAbbrev(std::move(Abbv));
   }
@@ -1278,19 +1365,17 @@ void ModuleBitcodeWriter::writeModuleInfo() {
     Vals.push_back(GV.getName().size());
     Vals.push_back(VE.getTypeID(GV.getValueType()));
     Vals.push_back(GV.getType()->getAddressSpace() << 2 | 2 | GV.isConstant());
-    Vals.push_back(GV.isDeclaration() ? 0 :
-                   (VE.getValueID(GV.getInitializer()) + 1));
+    Vals.push_back(
+        GV.isDeclaration() ? 0 : (VE.getValueID(GV.getInitializer()) + 1));
     Vals.push_back(getEncodedLinkage(GV));
-    Vals.push_back(Log2_32(GV.getAlignment())+1);
+    Vals.push_back(Log2_32(GV.getAlignment()) + 1);
     Vals.push_back(GV.hasSection() ? SectionMap[GV.getSection()] : 0);
     if (GV.isThreadLocal() ||
         GV.getVisibility() != GlobalValue::DefaultVisibility ||
         GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None ||
         GV.isExternallyInitialized() ||
         GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass ||
-        GV.hasComdat() ||
-        GV.hasAttributes() ||
-        GV.isDSOLocal() ||
+        GV.hasComdat() || GV.hasAttributes() || GV.isDSOLocal() ||
         GV.hasPartition()) {
       Vals.push_back(getEncodedVisibility(GV));
       Vals.push_back(getEncodedThreadLocalMode(GV));
@@ -1326,13 +1411,13 @@ void ModuleBitcodeWriter::writeModuleInfo() {
     Vals.push_back(F.isDeclaration());
     Vals.push_back(getEncodedLinkage(F));
     Vals.push_back(VE.getAttributeListID(F.getAttributes()));
-    Vals.push_back(Log2_32(F.getAlignment())+1);
+    Vals.push_back(Log2_32(F.getAlignment()) + 1);
     Vals.push_back(F.hasSection() ? SectionMap[F.getSection()] : 0);
     Vals.push_back(getEncodedVisibility(F));
     Vals.push_back(F.hasGC() ? GCMap[F.getGC()] : 0);
     Vals.push_back(getEncodedUnnamedAddr(F));
-    Vals.push_back(F.hasPrologueData() ? (VE.getValueID(F.getPrologueData()) + 1)
-                                       : 0);
+    Vals.push_back(
+        F.hasPrologueData() ? (VE.getValueID(F.getPrologueData()) + 1) : 0);
     Vals.push_back(getEncodedDLLStorageClass(F));
     Vals.push_back(F.hasComdat() ? VE.getComdatID(F.getComdat()) : 0);
     Vals.push_back(F.hasPrefixData() ? (VE.getValueID(F.getPrefixData()) + 1)
@@ -1864,9 +1949,9 @@ void ModuleBitcodeWriter::writeDILocalVariable(
   Record.clear();
 }
 
-void ModuleBitcodeWriter::writeDILabel(
-    const DILabel *N, SmallVectorImpl<uint64_t> &Record,
-    unsigned Abbrev) {
+void ModuleBitcodeWriter::writeDILabel(const DILabel *N,
+                                       SmallVectorImpl<uint64_t> &Record,
+                                       unsigned Abbrev) {
   Record.push_back((uint64_t)N->isDistinct());
   Record.push_back(VE.getMetadataOrNullID(N->getScope()));
   Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
@@ -2016,7 +2101,7 @@ void ModuleBitcodeWriter::writeMetadataRecords(
   if (MDs.empty())
     return;
 
-  // Initialize MDNode abbreviations.
+    // Initialize MDNode abbreviations.
 #define HANDLE_MDNODE_LEAF(CLASS) unsigned CLASS##Abbrev = 0;
 #include "llvm/IR/Metadata.def"
 
@@ -2181,7 +2266,8 @@ void ModuleBitcodeWriter::writeFunctionMetadataAttachment(const Function &F) {
       I.getAllMetadataOtherThanDebugLoc(MDs);
 
       // If no metadata, ignore instruction.
-      if (MDs.empty()) continue;
+      if (MDs.empty())
+        continue;
 
       Record.push_back(VE.getInstructionID(&I));
 
@@ -2204,7 +2290,8 @@ void ModuleBitcodeWriter::writeModuleMetadataKinds() {
   SmallVector<StringRef, 8> Names;
   M.getMDKindNames(Names);
 
-  if (Names.empty()) return;
+  if (Names.empty())
+    return;
 
   Stream.EnterSubblock(bitc::METADATA_KIND_BLOCK_ID, 3);
 
@@ -2274,7 +2361,8 @@ static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
 
 void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
                                          bool isGlobal) {
-  if (FirstVal == LastVal) return;
+  if (FirstVal == LastVal)
+    return;
 
   Stream.EnterSubblock(bitc::CONSTANTS_BLOCK_ID, 4);
 
@@ -2288,7 +2376,8 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
     auto Abbv = std::make_shared<BitCodeAbbrev>();
     Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_AGGREGATE));
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(LastVal+1)));
+    Abbv->Add(
+        BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, Log2_32_Ceil(LastVal + 1)));
     AggregateAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
     // Abbrev for CST_CODE_STRING.
@@ -2329,7 +2418,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
     if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
       Record.push_back(unsigned(IA->hasSideEffects()) |
                        unsigned(IA->isAlignStack()) << 1 |
-                       unsigned(IA->getDialect()&1) << 2);
+                       unsigned(IA->getDialect() & 1) << 2);
 
       // Add the asm string.
       const std::string &AsmStr = IA->getAsmString();
@@ -2357,7 +2446,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
         emitSignedInt64(Record, V);
         Code = bitc::CST_CODE_INTEGER;
         AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
-      } else {                             // Wide integers, > 64 bits in size.
+      } else { // Wide integers, > 64 bits in size.
         // We have an arbitrary precision integer value to write whose
         // bit width is > 64. However, in canonical unsigned integer
         // format it is likely that the high bits are going to be zero.
@@ -2397,7 +2486,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
       // If this is a null-terminated string, use the denser CSTRING encoding.
       if (Str->isCString()) {
         Code = bitc::CST_CODE_CSTRING;
-        --NumElts;  // Don't encode the null, which isn't allowed by char6.
+        --NumElts; // Don't encode the null, which isn't allowed by char6.
       } else {
         Code = bitc::CST_CODE_STRING;
         AbbrevToUse = String8Abbrev;
@@ -2417,7 +2506,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
       else if (isCStr7)
         AbbrevToUse = CString7Abbrev;
     } else if (const ConstantDataSequential *CDS =
-                  dyn_cast<ConstantDataSequential>(C)) {
+                   dyn_cast<ConstantDataSequential>(C)) {
       Code = bitc::CST_CODE_DATA;
       Type *EltTy = CDS->getType()->getElementType();
       if (isa<IntegerType>(EltTy)) {
@@ -2713,45 +2802,39 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
     break;
   }
 
-  case Instruction::Ret:
-    {
-      Code = bitc::FUNC_CODE_INST_RET;
-      unsigned NumOperands = I.getNumOperands();
-      if (NumOperands == 0)
-        AbbrevToUse = FUNCTION_INST_RET_VOID_ABBREV;
-      else if (NumOperands == 1) {
-        if (!pushValueAndType(I.getOperand(0), InstID, Vals))
-          AbbrevToUse = FUNCTION_INST_RET_VAL_ABBREV;
-      } else {
-        for (unsigned i = 0, e = NumOperands; i != e; ++i)
-          pushValueAndType(I.getOperand(i), InstID, Vals);
-      }
+  case Instruction::Ret: {
+    Code = bitc::FUNC_CODE_INST_RET;
+    unsigned NumOperands = I.getNumOperands();
+    if (NumOperands == 0)
+      AbbrevToUse = FUNCTION_INST_RET_VOID_ABBREV;
+    else if (NumOperands == 1) {
+      if (!pushValueAndType(I.getOperand(0), InstID, Vals))
+        AbbrevToUse = FUNCTION_INST_RET_VAL_ABBREV;
+    } else {
+      for (unsigned i = 0, e = NumOperands; i != e; ++i)
+        pushValueAndType(I.getOperand(i), InstID, Vals);
     }
-    break;
-  case Instruction::Br:
-    {
-      Code = bitc::FUNC_CODE_INST_BR;
-      const BranchInst &II = cast<BranchInst>(I);
-      Vals.push_back(VE.getValueID(II.getSuccessor(0)));
-      if (II.isConditional()) {
-        Vals.push_back(VE.getValueID(II.getSuccessor(1)));
-        pushValue(II.getCondition(), InstID, Vals);
-      }
+  } break;
+  case Instruction::Br: {
+    Code = bitc::FUNC_CODE_INST_BR;
+    const BranchInst &II = cast<BranchInst>(I);
+    Vals.push_back(VE.getValueID(II.getSuccessor(0)));
+    if (II.isConditional()) {
+      Vals.push_back(VE.getValueID(II.getSuccessor(1)));
+      pushValue(II.getCondition(), InstID, Vals);
     }
-    break;
-  case Instruction::Switch:
-    {
-      Code = bitc::FUNC_CODE_INST_SWITCH;
-      const SwitchInst &SI = cast<SwitchInst>(I);
-      Vals.push_back(VE.getTypeID(SI.getCondition()->getType()));
-      pushValue(SI.getCondition(), InstID, Vals);
-      Vals.push_back(VE.getValueID(SI.getDefaultDest()));
-      for (auto Case : SI.cases()) {
-        Vals.push_back(VE.getValueID(Case.getCaseValue()));
-        Vals.push_back(VE.getValueID(Case.getCaseSuccessor()));
-      }
+  } break;
+  case Instruction::Switch: {
+    Code = bitc::FUNC_CODE_INST_SWITCH;
+    const SwitchInst &SI = cast<SwitchInst>(I);
+    Vals.push_back(VE.getTypeID(SI.getCondition()->getType()));
+    pushValue(SI.getCondition(), InstID, Vals);
+    Vals.push_back(VE.getValueID(SI.getDefaultDest()));
+    for (auto Case : SI.cases()) {
+      Vals.push_back(VE.getValueID(Case.getCaseValue()));
+      Vals.push_back(VE.getValueID(Case.getCaseSuccessor()));
     }
-    break;
+  } break;
   case Instruction::IndirectBr:
     Code = bitc::FUNC_CODE_INST_INDIRECTBR;
     Vals.push_back(VE.getTypeID(I.getOperand(0)->getType()));
@@ -2938,7 +3021,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
         AbbrevToUse = FUNCTION_INST_LOAD_ABBREV;
     }
     Vals.push_back(VE.getTypeID(I.getType()));
-    Vals.push_back(Log2_32(cast<LoadInst>(I).getAlignment())+1);
+    Vals.push_back(Log2_32(cast<LoadInst>(I).getAlignment()) + 1);
     Vals.push_back(cast<LoadInst>(I).isVolatile());
     if (cast<LoadInst>(I).isAtomic()) {
       Vals.push_back(getEncodedOrdering(cast<LoadInst>(I).getOrdering()));
@@ -2952,7 +3035,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
       Code = bitc::FUNC_CODE_INST_STORE;
     pushValueAndType(I.getOperand(1), InstID, Vals); // ptrty + ptr
     pushValueAndType(I.getOperand(0), InstID, Vals); // valty + val
-    Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment())+1);
+    Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment()) + 1);
     Vals.push_back(cast<StoreInst>(I).isVolatile());
     if (cast<StoreInst>(I).isAtomic()) {
       Vals.push_back(getEncodedOrdering(cast<StoreInst>(I).getOrdering()));
@@ -3025,17 +3108,17 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
 
     // Emit type/value pairs for varargs params.
     if (FTy->isVarArg()) {
-      for (unsigned i = FTy->getNumParams(), e = CI.getNumArgOperands();
-           i != e; ++i)
+      for (unsigned i = FTy->getNumParams(), e = CI.getNumArgOperands(); i != e;
+           ++i)
         pushValueAndType(CI.getArgOperand(i), InstID, Vals); // varargs
     }
     break;
   }
   case Instruction::VAArg:
     Code = bitc::FUNC_CODE_INST_VAARG;
-    Vals.push_back(VE.getTypeID(I.getOperand(0)->getType()));   // valistty
-    pushValue(I.getOperand(0), InstID, Vals);                   // valist.
-    Vals.push_back(VE.getTypeID(I.getType())); // restype.
+    Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); // valistty
+    pushValue(I.getOperand(0), InstID, Vals);                 // valist.
+    Vals.push_back(VE.getTypeID(I.getType()));                // restype.
     break;
   }
 
@@ -3046,7 +3129,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
 /// Write a GlobalValue VST to the module. The purpose of this data structure is
 /// to allow clients to efficiently find the function body.
 void ModuleBitcodeWriter::writeGlobalValueSymbolTable(
-  DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex) {
+    DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex) {
   // Get the offset of the VST we are writing, and backpatch it into
   // the VST forward declaration record.
   uint64_t VSTOffset = Stream.GetCurrentBitNo();
@@ -3201,8 +3284,8 @@ void ModuleBitcodeWriter::writeFunction(
   DILocation *LastDL = nullptr;
   // Finally, emit all the instructions, in order.
   for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
-    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
-         I != E; ++I) {
+    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E;
+         ++I) {
       writeInstruction(*I, InstID, Vals);
 
       if (!I->getType()->isVoidTy())
@@ -3316,10 +3399,10 @@ void ModuleBitcodeWriter::writeBlockInfo() {
   { // CE_CAST abbrev for CONSTANTS_BLOCK.
     auto Abbv = std::make_shared<BitCodeAbbrev>();
     Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4));  // cast opc
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,       // typeid
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // cast opc
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,      // typeid
                               VE.computeBitsRequiredForTypeIndicies()));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));    // value id
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // value id
 
     if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) !=
         CONSTANTS_CE_CAST_Abbrev)
@@ -3341,7 +3424,7 @@ void ModuleBitcodeWriter::writeBlockInfo() {
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // Ptr
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,    // dest ty
                               VE.computeBitsRequiredForTypeIndicies()));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // Align
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // Align
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // volatile
     if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
         FUNCTION_INST_LOAD_ABBREV)
@@ -3350,7 +3433,7 @@ void ModuleBitcodeWriter::writeBlockInfo() {
   { // INST_UNOP abbrev for FUNCTION_BLOCK.
     auto Abbv = std::make_shared<BitCodeAbbrev>();
     Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNOP));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // LHS
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
     if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
         FUNCTION_INST_UNOP_ABBREV)
@@ -3359,7 +3442,7 @@ void ModuleBitcodeWriter::writeBlockInfo() {
   { // INST_UNOP_FLAGS abbrev for FUNCTION_BLOCK.
     auto Abbv = std::make_shared<BitCodeAbbrev>();
     Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_UNOP));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // LHS
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // flags
     if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
@@ -3369,8 +3452,8 @@ void ModuleBitcodeWriter::writeBlockInfo() {
   { // INST_BINOP abbrev for FUNCTION_BLOCK.
     auto Abbv = std::make_shared<BitCodeAbbrev>();
     Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // LHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // RHS
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
     if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
         FUNCTION_INST_BINOP_ABBREV)
@@ -3379,8 +3462,8 @@ void ModuleBitcodeWriter::writeBlockInfo() {
   { // INST_BINOP_FLAGS abbrev for FUNCTION_BLOCK.
     auto Abbv = std::make_shared<BitCodeAbbrev>();
     Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_BINOP));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // LHS
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // RHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // LHS
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // RHS
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
     Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 8)); // flags
     if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
@@ -3390,10 +3473,10 @@ void ModuleBitcodeWriter::writeBlockInfo() {
   { // INST_CAST abbrev for FUNCTION_BLOCK.
     auto Abbv = std::make_shared<BitCodeAbbrev>();
     Abbv->Add(BitCodeAbbrevOp(bitc::FUNC_CODE_INST_CAST));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));    // OpVal
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,       // dest ty
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // OpVal
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed,    // dest ty
                               VE.computeBitsRequiredForTypeIndicies()));
-    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4));  // opc
+    Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 4)); // opc
     if (Stream.EmitBlockInfoAbbrev(bitc::FUNCTION_BLOCK_ID, Abbv) !=
         FUNCTION_INST_CAST_ABBREV)
       llvm_unreachable("Unexpected abbrev ordering!");
@@ -3764,13 +3847,13 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   // Abbrev for FS_PERMODULE_PROFILE.
   auto Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_PROFILE));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // flags
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // instcount
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // fflags
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // numrefs
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // rorefcnt
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // worefcnt
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt
   // numrefs x valueid, n x (valueid, hotness)
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
@@ -3782,13 +3865,13 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
     Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE_RELBF));
   else
     Abbv->Add(BitCodeAbbrevOp(bitc::FS_PERMODULE));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // flags
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // instcount
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // fflags
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // numrefs
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // rorefcnt
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // worefcnt
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt
   // numrefs x valueid, n x (valueid [, rel_block_freq])
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
@@ -3817,9 +3900,9 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
   // Abbrev for FS_ALIAS.
   Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_ALIAS));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // flags
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
   unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
   // Abbrev for FS_TYPE_ID_METADATA
@@ -3913,15 +3996,15 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   // Abbrev for FS_COMBINED.
   auto Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // modid
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // flags
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // instcount
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // fflags
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // entrycount
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // numrefs
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // rorefcnt
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // worefcnt
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // entrycount
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt
   // numrefs x valueid, n x (valueid)
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
@@ -3930,15 +4013,15 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   // Abbrev for FS_COMBINED_PROFILE.
   Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_PROFILE));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // modid
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // flags
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // instcount
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // fflags
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // entrycount
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // numrefs
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // rorefcnt
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4));   // worefcnt
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // instcount
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // fflags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // entrycount
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // numrefs
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // rorefcnt
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 4)); // worefcnt
   // numrefs x valueid, n x (valueid, hotness)
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
@@ -3947,20 +4030,20 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
   // Abbrev for FS_COMBINED_GLOBALVAR_INIT_REFS.
   Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_GLOBALVAR_INIT_REFS));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // modid
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // flags
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));    // valueids
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Array));  // valueids
   Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
   unsigned FSModRefsAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
   // Abbrev for FS_COMBINED_ALIAS.
   Abbv = std::make_shared<BitCodeAbbrev>();
   Abbv->Add(BitCodeAbbrevOp(bitc::FS_COMBINED_ALIAS));
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // modid
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));   // flags
-  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));   // valueid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // modid
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6)); // flags
+  Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8)); // valueid
   unsigned FSAliasAbbrev = Stream.EmitAbbrev(std::move(Abbv));
 
   // The aliases are emitted as a post-pass, and will point to the value
@@ -4311,10 +4394,10 @@ static void emitDarwinBCHeaderAndTrailer(SmallVectorImpl<char> &Buffer,
   // number from /usr/include/mach/machine.h.  It is ok to reproduce the
   // specific constants here because they are implicitly part of the Darwin ABI.
   enum {
-    DARWIN_CPU_ARCH_ABI64      = 0x01000000,
-    DARWIN_CPU_TYPE_X86        = 7,
-    DARWIN_CPU_TYPE_ARM        = 12,
-    DARWIN_CPU_TYPE_POWERPC    = 18
+    DARWIN_CPU_ARCH_ABI64 = 0x01000000,
+    DARWIN_CPU_TYPE_X86 = 7,
+    DARWIN_CPU_TYPE_ARM = 12,
+    DARWIN_CPU_TYPE_POWERPC = 18
   };
 
   Triple::ArchType Arch = TT.getArch();
@@ -4463,7 +4546,7 @@ void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out,
                               const ModuleSummaryIndex *Index,
                               bool GenerateHash, ModuleHash *ModHash) {
   SmallVector<char, 0> Buffer;
-  Buffer.reserve(256*1024);
+  Buffer.reserve(256 * 1024);
 
   // If this is darwin or another generic macho target, reserve space for the
   // header.
@@ -4481,7 +4564,7 @@ void llvm::WriteBitcodeToFile(const Module &M, raw_ostream &Out,
     emitDarwinBCHeaderAndTrailer(Buffer, TT);
 
   // Write the generated bitstream to "Out".
-  Out.write((char*)&Buffer.front(), Buffer.size());
+  Out.write((char *)&Buffer.front(), Buffer.size());
 }
 
 void IndexBitcodeWriter::write() {
diff --git a/hpvm/llvm_patches/lib/IR/Attributes.cpp b/hpvm/llvm_patches/lib/IR/Attributes.cpp
index 264d0fe498b7f424686017d12759bf1d7d0ae2f6..3cc95b3102fdf6c7062fffe1f9486cfa094bba9b 100644
--- a/hpvm/llvm_patches/lib/IR/Attributes.cpp
+++ b/hpvm/llvm_patches/lib/IR/Attributes.cpp
@@ -82,7 +82,8 @@ Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind,
   LLVMContextImpl *pImpl = Context.pImpl;
   FoldingSetNodeID ID;
   ID.AddInteger(Kind);
-  if (Val) ID.AddInteger(Val);
+  if (Val)
+    ID.AddInteger(Val);
 
   void *InsertPoint;
   AttributeImpl *PA = pImpl->AttrsSet.FindNodeOrInsertPos(ID, InsertPoint);
@@ -105,7 +106,8 @@ Attribute Attribute::get(LLVMContext &Context, StringRef Kind, StringRef Val) {
   LLVMContextImpl *pImpl = Context.pImpl;
   FoldingSetNodeID ID;
   ID.AddString(Kind);
-  if (!Val.empty()) ID.AddString(Val);
+  if (!Val.empty())
+    ID.AddString(Val);
 
   void *InsertPoint;
   AttributeImpl *PA = pImpl->AttrsSet.FindNodeOrInsertPos(ID, InsertPoint);
@@ -156,7 +158,7 @@ Attribute Attribute::getWithStackAlignment(LLVMContext &Context,
 }
 
 Attribute Attribute::getWithDereferenceableBytes(LLVMContext &Context,
-                                                uint64_t Bytes) {
+                                                 uint64_t Bytes) {
   assert(Bytes && "Bytes must be non-zero.");
   return get(Context, Dereferenceable, Bytes);
 }
@@ -200,47 +202,52 @@ bool Attribute::isTypeAttribute() const {
 }
 
 Attribute::AttrKind Attribute::getKindAsEnum() const {
-  if (!pImpl) return None;
+  if (!pImpl)
+    return None;
   assert((isEnumAttribute() || isIntAttribute() || isTypeAttribute()) &&
          "Invalid attribute type to get the kind as an enum!");
   return pImpl->getKindAsEnum();
 }
 
 uint64_t Attribute::getValueAsInt() const {
-  if (!pImpl) return 0;
+  if (!pImpl)
+    return 0;
   assert(isIntAttribute() &&
          "Expected the attribute to be an integer attribute!");
   return pImpl->getValueAsInt();
 }
 
 StringRef Attribute::getKindAsString() const {
-  if (!pImpl) return {};
+  if (!pImpl)
+    return {};
   assert(isStringAttribute() &&
          "Invalid attribute type to get the kind as a string!");
   return pImpl->getKindAsString();
 }
 
 StringRef Attribute::getValueAsString() const {
-  if (!pImpl) return {};
+  if (!pImpl)
+    return {};
   assert(isStringAttribute() &&
          "Invalid attribute type to get the value as a string!");
   return pImpl->getValueAsString();
 }
 
 Type *Attribute::getValueAsType() const {
-  if (!pImpl) return {};
+  if (!pImpl)
+    return {};
   assert(isTypeAttribute() &&
          "Invalid attribute type to get the value as a type!");
   return pImpl->getValueAsType();
 }
 
-
 bool Attribute::hasAttribute(AttrKind Kind) const {
   return (pImpl && pImpl->hasAttribute(Kind)) || (!pImpl && Kind == None);
 }
 
 bool Attribute::hasAttribute(StringRef Kind) const {
-  if (!isStringAttribute()) return false;
+  if (!isStringAttribute())
+    return false;
   return pImpl && pImpl->hasAttribute(Kind);
 }
 
@@ -277,7 +284,8 @@ std::pair<unsigned, Optional<unsigned>> Attribute::getAllocSizeArgs() const {
 }
 
 std::string Attribute::getAsString(bool InAttrGrp) const {
-  if (!pImpl) return {};
+  if (!pImpl)
+    return {};
 
   if (hasAttribute(Attribute::SanitizeAddress))
     return "sanitize_address";
@@ -478,7 +486,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
     Result += (Twine('"') + getKindAsString() + Twine('"')).str();
 
     std::string AttrVal = pImpl->getValueAsString();
-    if (AttrVal.empty()) return Result;
+    if (AttrVal.empty())
+      return Result;
 
     // Since some attribute strings contain special characters that cannot be
     // printable, those have to be escaped to make the attribute value printable
@@ -496,9 +505,12 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
 }
 
 bool Attribute::operator<(Attribute A) const {
-  if (!pImpl && !A.pImpl) return false;
-  if (!pImpl) return true;
-  if (!A.pImpl) return false;
+  if (!pImpl && !A.pImpl)
+    return false;
+  if (!pImpl)
+    return true;
+  if (!A.pImpl)
+    return false;
   return *pImpl < *A.pImpl;
 }
 
@@ -518,12 +530,14 @@ void StringAttributeImpl::anchor() {}
 void TypeAttributeImpl::anchor() {}
 
 bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const {
-  if (isStringAttribute()) return false;
+  if (isStringAttribute())
+    return false;
   return getKindAsEnum() == A;
 }
 
 bool AttributeImpl::hasAttribute(StringRef Kind) const {
-  if (!isStringAttribute()) return false;
+  if (!isStringAttribute())
+    return false;
   return getKindAsString() == Kind;
 }
 
@@ -556,38 +570,51 @@ bool AttributeImpl::operator<(const AttributeImpl &AI) const {
   // This sorts the attributes with Attribute::AttrKinds coming first (sorted
   // relative to their enum value) and then strings.
   if (isEnumAttribute()) {
-    if (AI.isEnumAttribute()) return getKindAsEnum() < AI.getKindAsEnum();
-    if (AI.isIntAttribute()) return true;
-    if (AI.isStringAttribute()) return true;
-    if (AI.isTypeAttribute()) return true;
+    if (AI.isEnumAttribute())
+      return getKindAsEnum() < AI.getKindAsEnum();
+    if (AI.isIntAttribute())
+      return true;
+    if (AI.isStringAttribute())
+      return true;
+    if (AI.isTypeAttribute())
+      return true;
   }
 
   if (isTypeAttribute()) {
-    if (AI.isEnumAttribute()) return false;
+    if (AI.isEnumAttribute())
+      return false;
     if (AI.isTypeAttribute()) {
       assert(getKindAsEnum() != AI.getKindAsEnum() &&
              "Comparison of types would be unstable");
       return getKindAsEnum() < AI.getKindAsEnum();
     }
-    if (AI.isIntAttribute()) return true;
-    if (AI.isStringAttribute()) return true;
+    if (AI.isIntAttribute())
+      return true;
+    if (AI.isStringAttribute())
+      return true;
   }
 
   if (isIntAttribute()) {
-    if (AI.isEnumAttribute()) return false;
-    if (AI.isTypeAttribute()) return false;
+    if (AI.isEnumAttribute())
+      return false;
+    if (AI.isTypeAttribute())
+      return false;
     if (AI.isIntAttribute()) {
       if (getKindAsEnum() == AI.getKindAsEnum())
         return getValueAsInt() < AI.getValueAsInt();
       return getKindAsEnum() < AI.getKindAsEnum();
     }
-    if (AI.isStringAttribute()) return true;
+    if (AI.isStringAttribute())
+      return true;
   }
 
   assert(isStringAttribute());
-  if (AI.isEnumAttribute()) return false;
-  if (AI.isTypeAttribute()) return false;
-  if (AI.isIntAttribute()) return false;
+  if (AI.isEnumAttribute())
+    return false;
+  if (AI.isTypeAttribute())
+    return false;
+  if (AI.isIntAttribute())
+    return false;
   if (getKindAsString() == AI.getKindAsString())
     return getValueAsString() < AI.getValueAsString();
   return getKindAsString() < AI.getKindAsString();
@@ -607,7 +634,8 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<Attribute> Attrs) {
 
 AttributeSet AttributeSet::addAttribute(LLVMContext &C,
                                         Attribute::AttrKind Kind) const {
-  if (hasAttribute(Kind)) return *this;
+  if (hasAttribute(Kind))
+    return *this;
   AttrBuilder B;
   B.addAttribute(Kind);
   return addAttributes(C, AttributeSet::get(C, B));
@@ -632,27 +660,29 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C,
   for (const auto I : *this)
     B.addAttribute(I);
 
- return get(C, B);
+  return get(C, B);
 }
 
 AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
-                                             Attribute::AttrKind Kind) const {
-  if (!hasAttribute(Kind)) return *this;
+                                           Attribute::AttrKind Kind) const {
+  if (!hasAttribute(Kind))
+    return *this;
   AttrBuilder B(*this);
   B.removeAttribute(Kind);
   return get(C, B);
 }
 
 AttributeSet AttributeSet::removeAttribute(LLVMContext &C,
-                                             StringRef Kind) const {
-  if (!hasAttribute(Kind)) return *this;
+                                           StringRef Kind) const {
+  if (!hasAttribute(Kind))
+    return *this;
   AttrBuilder B(*this);
   B.removeAttribute(Kind);
   return get(C, B);
 }
 
 AttributeSet AttributeSet::removeAttributes(LLVMContext &C,
-                                              const AttrBuilder &Attrs) const {
+                                            const AttrBuilder &Attrs) const {
   AttrBuilder B(*this);
   B.remove(Attrs);
   return get(C, B);
@@ -718,8 +748,8 @@ AttributeSet::iterator AttributeSet::end() const {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD void AttributeSet::dump() const {
   dbgs() << "AS =\n";
-    dbgs() << "  { ";
-    dbgs() << getAsString(true) << " }\n";
+  dbgs() << "  { ";
+  dbgs() << getAsString(true) << " }\n";
 }
 #endif
 
@@ -732,8 +762,7 @@ AttributeSetNode::AttributeSetNode(ArrayRef<Attribute> Attrs)
   // There's memory after the node where we can store the entries in.
   llvm::copy(Attrs, getTrailingObjects<Attribute>());
 
-  static_assert(Attribute::EndAttrKinds <=
-                    sizeof(AvailableAttrs) * CHAR_BIT,
+  static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT,
                 "Too many attributes");
 
   for (const auto I : *this) {
@@ -761,7 +790,7 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
 
   void *InsertPoint;
   AttributeSetNode *PA =
-    pImpl->AttrsSetNodes.FindNodeOrInsertPos(ID, InsertPoint);
+      pImpl->AttrsSetNodes.FindNodeOrInsertPos(ID, InsertPoint);
 
   // If we didn't find any existing attributes of the same shape then create a
   // new one and insert it.
@@ -988,7 +1017,8 @@ AttributeList::get(LLVMContext &C,
                         [](const std::pair<unsigned, Attribute> &LHS,
                            const std::pair<unsigned, Attribute> &RHS) {
                           return LHS.first < RHS.first;
-                        }) && "Misordered Attributes list!");
+                        }) &&
+         "Misordered Attributes list!");
   assert(llvm::none_of(Attrs,
                        [](const std::pair<unsigned, Attribute> &Pair) {
                          return Pair.second.hasAttribute(Attribute::None);
@@ -999,7 +1029,8 @@ AttributeList::get(LLVMContext &C,
   // list.
   SmallVector<std::pair<unsigned, AttributeSet>, 8> AttrPairVec;
   for (ArrayRef<std::pair<unsigned, Attribute>>::iterator I = Attrs.begin(),
-         E = Attrs.end(); I != E; ) {
+                                                          E = Attrs.end();
+       I != E;) {
     unsigned Index = I->first;
     SmallVector<Attribute, 4> AttrVec;
     while (I != E && I->first == Index) {
@@ -1140,7 +1171,8 @@ AttributeList AttributeList::get(LLVMContext &C,
 
 AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index,
                                           Attribute::AttrKind Kind) const {
-  if (hasAttribute(Index, Kind)) return *this;
+  if (hasAttribute(Index, Kind))
+    return *this;
   AttrBuilder B;
   B.addAttribute(Kind);
   return addAttributes(C, Index, B);
@@ -1212,7 +1244,8 @@ AttributeList AttributeList::addParamAttribute(LLVMContext &C,
 
 AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
                                              Attribute::AttrKind Kind) const {
-  if (!hasAttribute(Index, Kind)) return *this;
+  if (!hasAttribute(Index, Kind))
+    return *this;
 
   Index = attrIdxToArrayIdx(Index);
   SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
@@ -1225,7 +1258,8 @@ AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
 
 AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
                                              StringRef Kind) const {
-  if (!hasAttribute(Index, Kind)) return *this;
+  if (!hasAttribute(Index, Kind))
+    return *this;
 
   Index = attrIdxToArrayIdx(Index);
   SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
@@ -1335,7 +1369,8 @@ bool AttributeList::hasParamAttribute(unsigned ArgNo,
 
 bool AttributeList::hasAttrSomewhere(Attribute::AttrKind Attr,
                                      unsigned *Index) const {
-  if (!pImpl) return false;
+  if (!pImpl)
+    return false;
 
   for (unsigned I = index_begin(), E = index_end(); I != E; ++I) {
     if (hasAttribute(I, Attr)) {
@@ -1366,10 +1401,9 @@ unsigned AttributeList::getParamAlignment(unsigned ArgNo) const {
 }
 
 Type *AttributeList::getParamByValType(unsigned Index) const {
-  return getAttributes(Index+FirstArgIndex).getByValType();
+  return getAttributes(Index + FirstArgIndex).getByValType();
 }
 
-
 unsigned AttributeList::getStackAlignment(unsigned Index) const {
   return getAttributes(Index).getStackAlignment();
 }
@@ -1526,7 +1560,8 @@ std::pair<unsigned, Optional<unsigned>> AttrBuilder::getAllocSizeArgs() const {
 }
 
 AttrBuilder &AttrBuilder::addAlignmentAttr(unsigned Align) {
-  if (Align == 0) return *this;
+  if (Align == 0)
+    return *this;
 
   assert(isPowerOf2_32(Align) && "Alignment must be a power of two.");
   assert(Align <= 0x40000000 && "Alignment too large.");
@@ -1538,7 +1573,8 @@ AttrBuilder &AttrBuilder::addAlignmentAttr(unsigned Align) {
 
 AttrBuilder &AttrBuilder::addStackAlignmentAttr(unsigned Align) {
   // Default alignment, allow the target to define how to align it.
-  if (Align == 0) return *this;
+  if (Align == 0)
+    return *this;
 
   assert(isPowerOf2_32(Align) && "Alignment must be a power of two.");
   assert(Align <= 0x100 && "Alignment too large.");
@@ -1549,7 +1585,8 @@ AttrBuilder &AttrBuilder::addStackAlignmentAttr(unsigned Align) {
 }
 
 AttrBuilder &AttrBuilder::addDereferenceableAttr(uint64_t Bytes) {
-  if (Bytes == 0) return *this;
+  if (Bytes == 0)
+    return *this;
 
   Attrs[Attribute::Dereferenceable] = true;
   DerefBytes = Bytes;
@@ -1680,16 +1717,14 @@ bool AttrBuilder::hasAttributes(AttributeList AL, uint64_t Index) const {
   return false;
 }
 
-bool AttrBuilder::hasAlignmentAttr() const {
-  return Alignment != 0;
-}
+bool AttrBuilder::hasAlignmentAttr() const { return Alignment != 0; }
 
 bool AttrBuilder::operator==(const AttrBuilder &B) {
   if (Attrs != B.Attrs)
     return false;
 
-  for (td_const_iterator I = TargetDepAttrs.begin(),
-         E = TargetDepAttrs.end(); I != E; ++I)
+  for (td_const_iterator I = TargetDepAttrs.begin(), E = TargetDepAttrs.end();
+       I != E; ++I)
     if (B.TargetDepAttrs.find(I->first) == B.TargetDepAttrs.end())
       return false;
 
@@ -1707,27 +1742,26 @@ AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) {
 
   if (!Ty->isIntegerTy())
     // Attribute that only apply to integers.
-    Incompatible.addAttribute(Attribute::SExt)
-      .addAttribute(Attribute::ZExt);
+    Incompatible.addAttribute(Attribute::SExt).addAttribute(Attribute::ZExt);
 
   if (!Ty->isPointerTy())
     // Attribute that only apply to pointers.
     Incompatible.addAttribute(Attribute::ByVal)
-      .addAttribute(Attribute::Nest)
-      .addAttribute(Attribute::NoAlias)
-      .addAttribute(Attribute::NoCapture)
-      .addAttribute(Attribute::NonNull)
-      .addDereferenceableAttr(1) // the int here is ignored
-      .addDereferenceableOrNullAttr(1) // the int here is ignored
-      .addAttribute(Attribute::ReadNone)
-      .addAttribute(Attribute::ReadOnly)
-      .addAttribute(Attribute::StructRet)
-      .addAttribute(Attribute::InAlloca);
+        .addAttribute(Attribute::Nest)
+        .addAttribute(Attribute::NoAlias)
+        .addAttribute(Attribute::NoCapture)
+        .addAttribute(Attribute::NonNull)
+        .addDereferenceableAttr(1)       // the int here is ignored
+        .addDereferenceableOrNullAttr(1) // the int here is ignored
+        .addAttribute(Attribute::ReadNone)
+        .addAttribute(Attribute::ReadOnly)
+        .addAttribute(Attribute::StructRet)
+        .addAttribute(Attribute::InAlloca);
 
   return Incompatible;
 }
 
-template<typename AttrClass>
+template <typename AttrClass>
 static bool isEqual(const Function &Caller, const Function &Callee) {
   return Caller.getFnAttribute(AttrClass::getKind()) ==
          Callee.getFnAttribute(AttrClass::getKind());
@@ -1738,7 +1772,7 @@ static bool isEqual(const Function &Caller, const Function &Callee) {
 ///
 /// This function sets the caller's attribute to false if the callee's attribute
 /// is false.
-template<typename AttrClass>
+template <typename AttrClass>
 static void setAND(Function &Caller, const Function &Callee) {
   if (AttrClass::isSet(Caller, AttrClass::getKind()) &&
       !AttrClass::isSet(Callee, AttrClass::getKind()))
@@ -1750,7 +1784,7 @@ static void setAND(Function &Caller, const Function &Callee) {
 ///
 /// This function sets the caller's attribute to true if the callee's attribute
 /// is true.
-template<typename AttrClass>
+template <typename AttrClass>
 static void setOR(Function &Caller, const Function &Callee) {
   if (!AttrClass::isSet(Caller, AttrClass::getKind()) &&
       AttrClass::isSet(Callee, AttrClass::getKind()))
@@ -1793,18 +1827,18 @@ static void adjustCallerStackProbes(Function &Caller, const Function &Callee) {
 /// If the inlined function defines the size of guard region
 /// on the stack, then ensure that the calling function defines a guard region
 /// that is no larger.
-static void
-adjustCallerStackProbeSize(Function &Caller, const Function &Callee) {
+static void adjustCallerStackProbeSize(Function &Caller,
+                                       const Function &Callee) {
   if (Callee.hasFnAttribute("stack-probe-size")) {
     uint64_t CalleeStackProbeSize;
     Callee.getFnAttribute("stack-probe-size")
-          .getValueAsString()
-          .getAsInteger(0, CalleeStackProbeSize);
+        .getValueAsString()
+        .getAsInteger(0, CalleeStackProbeSize);
     if (Caller.hasFnAttribute("stack-probe-size")) {
       uint64_t CallerStackProbeSize;
       Caller.getFnAttribute("stack-probe-size")
-            .getValueAsString()
-            .getAsInteger(0, CallerStackProbeSize);
+          .getValueAsString()
+          .getAsInteger(0, CallerStackProbeSize);
       if (CallerStackProbeSize > CalleeStackProbeSize) {
         Caller.addFnAttr(Callee.getFnAttribute("stack-probe-size"));
       }
@@ -1823,18 +1857,18 @@ adjustCallerStackProbeSize(Function &Caller, const Function &Callee) {
 /// to merge the attribute this way. Heuristics that would use
 /// min-legal-vector-width to determine inline compatibility would need to be
 /// handled as part of inline cost analysis.
-static void
-adjustMinLegalVectorWidth(Function &Caller, const Function &Callee) {
+static void adjustMinLegalVectorWidth(Function &Caller,
+                                      const Function &Callee) {
   if (Caller.hasFnAttribute("min-legal-vector-width")) {
     if (Callee.hasFnAttribute("min-legal-vector-width")) {
       uint64_t CallerVectorWidth;
       Caller.getFnAttribute("min-legal-vector-width")
-            .getValueAsString()
-            .getAsInteger(0, CallerVectorWidth);
+          .getValueAsString()
+          .getAsInteger(0, CallerVectorWidth);
       uint64_t CalleeVectorWidth;
       Callee.getFnAttribute("min-legal-vector-width")
-            .getValueAsString()
-            .getAsInteger(0, CalleeVectorWidth);
+          .getValueAsString()
+          .getAsInteger(0, CalleeVectorWidth);
       if (CallerVectorWidth < CalleeVectorWidth)
         Caller.addFnAttr(Callee.getFnAttribute("min-legal-vector-width"));
     } else {
@@ -1847,8 +1881,8 @@ adjustMinLegalVectorWidth(Function &Caller, const Function &Callee) {
 
 /// If the inlined function has "null-pointer-is-valid=true" attribute,
 /// set this attribute in the caller post inlining.
-static void
-adjustNullPointerValidAttr(Function &Caller, const Function &Callee) {
+static void adjustNullPointerValidAttr(Function &Caller,
+                                       const Function &Callee) {
   if (Callee.nullPointerIsDefined() && !Caller.nullPointerIsDefined()) {
     Caller.addFnAttr(Callee.getFnAttribute("null-pointer-is-valid"));
   }
diff --git a/hpvm/projects/llvm-cbe/build/CMakeCache.txt b/hpvm/projects/llvm-cbe/build/CMakeCache.txt
deleted file mode 100644
index 5d9ac640421f729cf532c6e4406548fe77085f49..0000000000000000000000000000000000000000
--- a/hpvm/projects/llvm-cbe/build/CMakeCache.txt
+++ /dev/null
@@ -1,314 +0,0 @@
-# This is the CMakeCache file.
-# For build in directory: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build
-# It was generated by CMake: /usr/bin/cmake
-# You can edit this file to change values found and used by cmake.
-# If you do not want to change any of the values, simply exit the editor.
-# If you do want to change a value, simply edit, save, and exit the editor.
-# The syntax for the file is as follows:
-# KEY:TYPE=VALUE
-# KEY is the name of a variable in the cache.
-# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
-# VALUE is the current value for the KEY.
-
-########################
-# EXTERNAL cache entries
-########################
-
-//Path to a program.
-CMAKE_AR:FILEPATH=/usr/bin/ar
-
-//For backwards compatibility, what version of CMake commands and
-// syntax should this version of CMake try to support.
-CMAKE_BACKWARDS_COMPATIBILITY:STRING=2.4
-
-//Choose the type of build, options are: None(CMAKE_CXX_FLAGS or
-// CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel.
-CMAKE_BUILD_TYPE:STRING=
-
-//Enable/Disable color output during build.
-CMAKE_COLOR_MAKEFILE:BOOL=ON
-
-//CXX compiler
-CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/c++
-
-//Flags used by the compiler during all build types.
-CMAKE_CXX_FLAGS:STRING=
-
-//Flags used by the compiler during debug builds.
-CMAKE_CXX_FLAGS_DEBUG:STRING=-g
-
-//Flags used by the compiler during release builds for minimum
-// size.
-CMAKE_CXX_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
-
-//Flags used by the compiler during release builds.
-CMAKE_CXX_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
-
-//Flags used by the compiler during release builds with debug info.
-CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
-
-//C compiler
-CMAKE_C_COMPILER:FILEPATH=/usr/bin/cc
-
-//Flags used by the compiler during all build types.
-CMAKE_C_FLAGS:STRING=
-
-//Flags used by the compiler during debug builds.
-CMAKE_C_FLAGS_DEBUG:STRING=-g
-
-//Flags used by the compiler during release builds for minimum
-// size.
-CMAKE_C_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
-
-//Flags used by the compiler during release builds.
-CMAKE_C_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
-
-//Flags used by the compiler during release builds with debug info.
-CMAKE_C_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
-
-//Flags used by the linker.
-CMAKE_EXE_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during debug builds.
-CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during release minsize builds.
-CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during release builds.
-CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during Release with Debug Info builds.
-CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Enable/Disable output of compile commands during generation.
-CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF
-
-//Install path prefix, prepended onto install directories.
-CMAKE_INSTALL_PREFIX:PATH=/usr/local
-
-//Path to a program.
-CMAKE_LINKER:FILEPATH=/usr/bin/ld
-
-//Path to a program.
-CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/make
-
-//Flags used by the linker during the creation of modules.
-CMAKE_MODULE_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during debug builds.
-CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during release minsize builds.
-CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during release builds.
-CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during Release with Debug Info builds.
-CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Path to a program.
-CMAKE_NM:FILEPATH=/usr/bin/nm
-
-//Path to a program.
-CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
-
-//Path to a program.
-CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
-
-//Value Computed by CMake
-CMAKE_PROJECT_NAME:STATIC=Project
-
-//Path to a program.
-CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
-
-//Flags used by the linker during the creation of dll's.
-CMAKE_SHARED_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during debug builds.
-CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during release minsize builds.
-CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during release builds.
-CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during Release with Debug Info builds.
-CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//If set, runtime paths are not added when installing shared libraries,
-// but are added when building.
-CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
-
-//If set, runtime paths are not added when using shared libraries.
-CMAKE_SKIP_RPATH:BOOL=NO
-
-//Flags used by the linker during the creation of static libraries.
-CMAKE_STATIC_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during debug builds.
-CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during release minsize builds.
-CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during release builds.
-CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during Release with Debug Info builds.
-CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Path to a program.
-CMAKE_STRIP:FILEPATH=/usr/bin/strip
-
-//If this value is on, makefiles will be generated without the
-// .SILENT directive, and all commands will be echoed to the console
-// during the make.  This is useful for debugging only. With Visual
-// Studio IDE projects all commands are done without /nologo.
-CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
-
-//Single output directory for building all executables.
-EXECUTABLE_OUTPUT_PATH:PATH=
-
-//Single output directory for building all libraries.
-LIBRARY_OUTPUT_PATH:PATH=
-
-//Value Computed by CMake
-Project_BINARY_DIR:STATIC=/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build
-
-//Value Computed by CMake
-Project_SOURCE_DIR:STATIC=/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe
-
-
-########################
-# INTERNAL cache entries
-########################
-
-//ADVANCED property for variable: CMAKE_AR
-CMAKE_AR-ADVANCED:INTERNAL=1
-//This is the directory where this CMakeCache.txt was created
-CMAKE_CACHEFILE_DIR:INTERNAL=/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build
-//Major version of cmake used to create the current loaded cache
-CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
-//Minor version of cmake used to create the current loaded cache
-CMAKE_CACHE_MINOR_VERSION:INTERNAL=5
-//Patch version of cmake used to create the current loaded cache
-CMAKE_CACHE_PATCH_VERSION:INTERNAL=1
-//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
-CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
-//Path to CMake executable.
-CMAKE_COMMAND:INTERNAL=/usr/bin/cmake
-//Path to cpack program executable.
-CMAKE_CPACK_COMMAND:INTERNAL=/usr/bin/cpack
-//Path to ctest program executable.
-CMAKE_CTEST_COMMAND:INTERNAL=/usr/bin/ctest
-//ADVANCED property for variable: CMAKE_CXX_COMPILER
-CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS
-CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG
-CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL
-CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE
-CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO
-CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_COMPILER
-CMAKE_C_COMPILER-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_FLAGS
-CMAKE_C_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_FLAGS_DEBUG
-CMAKE_C_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_FLAGS_MINSIZEREL
-CMAKE_C_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_FLAGS_RELEASE
-CMAKE_C_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_FLAGS_RELWITHDEBINFO
-CMAKE_C_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//Executable file format
-CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
-CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
-CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
-CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
-CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
-CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
-//Name of external makefile project generator.
-CMAKE_EXTRA_GENERATOR:INTERNAL=
-//Name of generator.
-CMAKE_GENERATOR:INTERNAL=Unix Makefiles
-//Name of generator platform.
-CMAKE_GENERATOR_PLATFORM:INTERNAL=
-//Name of generator toolset.
-CMAKE_GENERATOR_TOOLSET:INTERNAL=
-//Source directory with the top level CMakeLists.txt file for this
-// project
-CMAKE_HOME_DIRECTORY:INTERNAL=/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe
-//Install .so files without execute permission.
-CMAKE_INSTALL_SO_NO_EXE:INTERNAL=1
-//ADVANCED property for variable: CMAKE_LINKER
-CMAKE_LINKER-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
-CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
-CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
-CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
-CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
-CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_NM
-CMAKE_NM-ADVANCED:INTERNAL=1
-//number of local generators
-CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=4
-//ADVANCED property for variable: CMAKE_OBJCOPY
-CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_OBJDUMP
-CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_RANLIB
-CMAKE_RANLIB-ADVANCED:INTERNAL=1
-//Path to CMake installation.
-CMAKE_ROOT:INTERNAL=/usr/share/cmake-3.5
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
-CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
-CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
-CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
-CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
-CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SKIP_RPATH
-CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
-CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
-CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
-CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
-CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STRIP
-CMAKE_STRIP-ADVANCED:INTERNAL=1
-//uname command
-CMAKE_UNAME:INTERNAL=/bin/uname
-//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
-CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1
-
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCCompiler.cmake b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCCompiler.cmake
deleted file mode 100644
index f40522e627a66ddca0a1b7c75b83836d5e12e77a..0000000000000000000000000000000000000000
--- a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCCompiler.cmake
+++ /dev/null
@@ -1,67 +0,0 @@
-set(CMAKE_C_COMPILER "/usr/bin/cc")
-set(CMAKE_C_COMPILER_ARG1 "")
-set(CMAKE_C_COMPILER_ID "GNU")
-set(CMAKE_C_COMPILER_VERSION "5.4.0")
-set(CMAKE_C_COMPILER_WRAPPER "")
-set(CMAKE_C_STANDARD_COMPUTED_DEFAULT "11")
-set(CMAKE_C_COMPILE_FEATURES "c_function_prototypes;c_restrict;c_variadic_macros;c_static_assert")
-set(CMAKE_C90_COMPILE_FEATURES "c_function_prototypes")
-set(CMAKE_C99_COMPILE_FEATURES "c_restrict;c_variadic_macros")
-set(CMAKE_C11_COMPILE_FEATURES "c_static_assert")
-
-set(CMAKE_C_PLATFORM_ID "Linux")
-set(CMAKE_C_SIMULATE_ID "")
-set(CMAKE_C_SIMULATE_VERSION "")
-
-set(CMAKE_AR "/usr/bin/ar")
-set(CMAKE_RANLIB "/usr/bin/ranlib")
-set(CMAKE_LINKER "/usr/bin/ld")
-set(CMAKE_COMPILER_IS_GNUCC 1)
-set(CMAKE_C_COMPILER_LOADED 1)
-set(CMAKE_C_COMPILER_WORKS TRUE)
-set(CMAKE_C_ABI_COMPILED TRUE)
-set(CMAKE_COMPILER_IS_MINGW )
-set(CMAKE_COMPILER_IS_CYGWIN )
-if(CMAKE_COMPILER_IS_CYGWIN)
-  set(CYGWIN 1)
-  set(UNIX 1)
-endif()
-
-set(CMAKE_C_COMPILER_ENV_VAR "CC")
-
-if(CMAKE_COMPILER_IS_MINGW)
-  set(MINGW 1)
-endif()
-set(CMAKE_C_COMPILER_ID_RUN 1)
-set(CMAKE_C_SOURCE_FILE_EXTENSIONS c;m)
-set(CMAKE_C_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC)
-set(CMAKE_C_LINKER_PREFERENCE 10)
-
-# Save compiler ABI information.
-set(CMAKE_C_SIZEOF_DATA_PTR "8")
-set(CMAKE_C_COMPILER_ABI "ELF")
-set(CMAKE_C_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-
-if(CMAKE_C_SIZEOF_DATA_PTR)
-  set(CMAKE_SIZEOF_VOID_P "${CMAKE_C_SIZEOF_DATA_PTR}")
-endif()
-
-if(CMAKE_C_COMPILER_ABI)
-  set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_C_COMPILER_ABI}")
-endif()
-
-if(CMAKE_C_LIBRARY_ARCHITECTURE)
-  set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-endif()
-
-set(CMAKE_C_CL_SHOWINCLUDES_PREFIX "")
-if(CMAKE_C_CL_SHOWINCLUDES_PREFIX)
-  set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_C_CL_SHOWINCLUDES_PREFIX}")
-endif()
-
-
-
-
-set(CMAKE_C_IMPLICIT_LINK_LIBRARIES "c")
-set(CMAKE_C_IMPLICIT_LINK_DIRECTORIES "/usr/lib/gcc/x86_64-linux-gnu/5;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib")
-set(CMAKE_C_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCXXCompiler.cmake b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCXXCompiler.cmake
deleted file mode 100644
index 013ee9298fb861e7d0350d49a1fc08c0274b5e59..0000000000000000000000000000000000000000
--- a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeCXXCompiler.cmake
+++ /dev/null
@@ -1,68 +0,0 @@
-set(CMAKE_CXX_COMPILER "/usr/bin/c++")
-set(CMAKE_CXX_COMPILER_ARG1 "")
-set(CMAKE_CXX_COMPILER_ID "GNU")
-set(CMAKE_CXX_COMPILER_VERSION "5.4.0")
-set(CMAKE_CXX_COMPILER_WRAPPER "")
-set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT "98")
-set(CMAKE_CXX_COMPILE_FEATURES "cxx_template_template_parameters;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates")
-set(CMAKE_CXX98_COMPILE_FEATURES "cxx_template_template_parameters")
-set(CMAKE_CXX11_COMPILE_FEATURES "cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates")
-set(CMAKE_CXX14_COMPILE_FEATURES "cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates")
-
-set(CMAKE_CXX_PLATFORM_ID "Linux")
-set(CMAKE_CXX_SIMULATE_ID "")
-set(CMAKE_CXX_SIMULATE_VERSION "")
-
-set(CMAKE_AR "/usr/bin/ar")
-set(CMAKE_RANLIB "/usr/bin/ranlib")
-set(CMAKE_LINKER "/usr/bin/ld")
-set(CMAKE_COMPILER_IS_GNUCXX 1)
-set(CMAKE_CXX_COMPILER_LOADED 1)
-set(CMAKE_CXX_COMPILER_WORKS TRUE)
-set(CMAKE_CXX_ABI_COMPILED TRUE)
-set(CMAKE_COMPILER_IS_MINGW )
-set(CMAKE_COMPILER_IS_CYGWIN )
-if(CMAKE_COMPILER_IS_CYGWIN)
-  set(CYGWIN 1)
-  set(UNIX 1)
-endif()
-
-set(CMAKE_CXX_COMPILER_ENV_VAR "CXX")
-
-if(CMAKE_COMPILER_IS_MINGW)
-  set(MINGW 1)
-endif()
-set(CMAKE_CXX_COMPILER_ID_RUN 1)
-set(CMAKE_CXX_IGNORE_EXTENSIONS inl;h;hpp;HPP;H;o;O;obj;OBJ;def;DEF;rc;RC)
-set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;mm;CPP)
-set(CMAKE_CXX_LINKER_PREFERENCE 30)
-set(CMAKE_CXX_LINKER_PREFERENCE_PROPAGATES 1)
-
-# Save compiler ABI information.
-set(CMAKE_CXX_SIZEOF_DATA_PTR "8")
-set(CMAKE_CXX_COMPILER_ABI "ELF")
-set(CMAKE_CXX_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-
-if(CMAKE_CXX_SIZEOF_DATA_PTR)
-  set(CMAKE_SIZEOF_VOID_P "${CMAKE_CXX_SIZEOF_DATA_PTR}")
-endif()
-
-if(CMAKE_CXX_COMPILER_ABI)
-  set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CXX_COMPILER_ABI}")
-endif()
-
-if(CMAKE_CXX_LIBRARY_ARCHITECTURE)
-  set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-endif()
-
-set(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX "")
-if(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX)
-  set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_CXX_CL_SHOWINCLUDES_PREFIX}")
-endif()
-
-
-
-
-set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "stdc++;m;c")
-set(CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "/usr/lib/gcc/x86_64-linux-gnu/5;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib")
-set(CMAKE_CXX_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_C.bin b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_C.bin
deleted file mode 100755
index 007976746bbc08577dee275193a151481f73ad7d..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_C.bin and /dev/null differ
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_CXX.bin b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_CXX.bin
deleted file mode 100755
index 9717f93a704a711c1635031e2c1da2d3efcb684d..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeDetermineCompilerABI_CXX.bin and /dev/null differ
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeSystem.cmake b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeSystem.cmake
deleted file mode 100644
index 1927fbd348850efae35e1e56d7276bc0413aecb2..0000000000000000000000000000000000000000
--- a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CMakeSystem.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-set(CMAKE_HOST_SYSTEM "Linux-4.15.0-66-generic")
-set(CMAKE_HOST_SYSTEM_NAME "Linux")
-set(CMAKE_HOST_SYSTEM_VERSION "4.15.0-66-generic")
-set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
-
-
-
-set(CMAKE_SYSTEM "Linux-4.15.0-66-generic")
-set(CMAKE_SYSTEM_NAME "Linux")
-set(CMAKE_SYSTEM_VERSION "4.15.0-66-generic")
-set(CMAKE_SYSTEM_PROCESSOR "x86_64")
-
-set(CMAKE_CROSSCOMPILING "FALSE")
-
-set(CMAKE_SYSTEM_LOADED 1)
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/CMakeCCompilerId.c b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/CMakeCCompilerId.c
deleted file mode 100644
index 570a15e994e4f10ca4a05b4451ea350fb942337f..0000000000000000000000000000000000000000
--- a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/CMakeCCompilerId.c
+++ /dev/null
@@ -1,544 +0,0 @@
-#ifdef __cplusplus
-# error "A C++ compiler has been selected for C."
-#endif
-
-#if defined(__18CXX)
-# define ID_VOID_MAIN
-#endif
-
-
-/* Version number components: V=Version, R=Revision, P=Patch
-   Version date components:   YYYY=Year, MM=Month,   DD=Day  */
-
-#if defined(__INTEL_COMPILER) || defined(__ICC)
-# define COMPILER_ID "Intel"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-  /* __INTEL_COMPILER = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
-# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
-# if defined(__INTEL_COMPILER_UPDATE)
-#  define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
-# else
-#  define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER   % 10)
-# endif
-# if defined(__INTEL_COMPILER_BUILD_DATE)
-  /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
-#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
-# endif
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-
-#elif defined(__PATHCC__)
-# define COMPILER_ID "PathScale"
-# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
-# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
-# if defined(__PATHCC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
-# endif
-
-#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
-# define COMPILER_ID "Embarcadero"
-# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
-# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
-# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__     & 0xFFFF)
-
-#elif defined(__BORLANDC__)
-# define COMPILER_ID "Borland"
-  /* __BORLANDC__ = 0xVRR */
-# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
-# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
-
-#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
-# define COMPILER_ID "Watcom"
-   /* __WATCOMC__ = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__WATCOMC__)
-# define COMPILER_ID "OpenWatcom"
-   /* __WATCOMC__ = VVRP + 1100 */
-# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__SUNPRO_C)
-# define COMPILER_ID "SunPro"
-# if __SUNPRO_C >= 0x5100
-   /* __SUNPRO_C = 0xVRRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>12)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xFF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_C    & 0xF)
-# else
-   /* __SUNPRO_CC = 0xVRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>8)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_C    & 0xF)
-# endif
-
-#elif defined(__HP_cc)
-# define COMPILER_ID "HP"
-  /* __HP_cc = VVRRPP */
-# define COMPILER_VERSION_MAJOR DEC(__HP_cc/10000)
-# define COMPILER_VERSION_MINOR DEC(__HP_cc/100 % 100)
-# define COMPILER_VERSION_PATCH DEC(__HP_cc     % 100)
-
-#elif defined(__DECC)
-# define COMPILER_ID "Compaq"
-  /* __DECC_VER = VVRRTPPPP */
-# define COMPILER_VERSION_MAJOR DEC(__DECC_VER/10000000)
-# define COMPILER_VERSION_MINOR DEC(__DECC_VER/100000  % 100)
-# define COMPILER_VERSION_PATCH DEC(__DECC_VER         % 10000)
-
-#elif defined(__IBMC__) && defined(__COMPILER_VER__)
-# define COMPILER_ID "zOS"
-  /* __IBMC__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMC__    % 10)
-
-#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ >= 800
-# define COMPILER_ID "XL"
-  /* __IBMC__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMC__    % 10)
-
-#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ < 800
-# define COMPILER_ID "VisualAge"
-  /* __IBMC__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMC__    % 10)
-
-#elif defined(__PGI)
-# define COMPILER_ID "PGI"
-# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
-# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
-# if defined(__PGIC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
-# endif
-
-#elif defined(_CRAYC)
-# define COMPILER_ID "Cray"
-# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
-# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
-
-#elif defined(__TI_COMPILER_VERSION__)
-# define COMPILER_ID "TI"
-  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
-# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
-# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000   % 1000)
-# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__        % 1000)
-
-#elif defined(__FUJITSU) || defined(__FCC_VERSION) || defined(__fcc_version)
-# define COMPILER_ID "Fujitsu"
-
-#elif defined(__TINYC__)
-# define COMPILER_ID "TinyCC"
-
-#elif defined(__SCO_VERSION__)
-# define COMPILER_ID "SCO"
-
-#elif defined(__clang__) && defined(__apple_build_version__)
-# define COMPILER_ID "AppleClang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
-
-#elif defined(__clang__)
-# define COMPILER_ID "Clang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-
-#elif defined(__GNUC__)
-# define COMPILER_ID "GNU"
-# define COMPILER_VERSION_MAJOR DEC(__GNUC__)
-# if defined(__GNUC_MINOR__)
-#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
-# endif
-# if defined(__GNUC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-# endif
-
-#elif defined(_MSC_VER)
-# define COMPILER_ID "MSVC"
-  /* _MSC_VER = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
-# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
-# if defined(_MSC_FULL_VER)
-#  if _MSC_VER >= 1400
-    /* _MSC_FULL_VER = VVRRPPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
-#  else
-    /* _MSC_FULL_VER = VVRRPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
-#  endif
-# endif
-# if defined(_MSC_BUILD)
-#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
-# endif
-
-#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__)
-# define COMPILER_ID "ADSP"
-#if defined(__VISUALDSPVERSION__)
-  /* __VISUALDSPVERSION__ = 0xVVRRPP00 */
-# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24)
-# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF)
-# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8  & 0xFF)
-#endif
-
-#elif defined(__IAR_SYSTEMS_ICC__ ) || defined(__IAR_SYSTEMS_ICC)
-# define COMPILER_ID "IAR"
-
-#elif defined(__ARMCC_VERSION)
-# define COMPILER_ID "ARMCC"
-#if __ARMCC_VERSION >= 1000000
-  /* __ARMCC_VERSION = VRRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION     % 10000)
-#else
-  /* __ARMCC_VERSION = VRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION    % 10000)
-#endif
-
-
-#elif defined(SDCC)
-# define COMPILER_ID "SDCC"
-  /* SDCC = VRP */
-#  define COMPILER_VERSION_MAJOR DEC(SDCC/100)
-#  define COMPILER_VERSION_MINOR DEC(SDCC/10 % 10)
-#  define COMPILER_VERSION_PATCH DEC(SDCC    % 10)
-
-#elif defined(_SGI_COMPILER_VERSION) || defined(_COMPILER_VERSION)
-# define COMPILER_ID "MIPSpro"
-# if defined(_SGI_COMPILER_VERSION)
-  /* _SGI_COMPILER_VERSION = VRP */
-#  define COMPILER_VERSION_MAJOR DEC(_SGI_COMPILER_VERSION/100)
-#  define COMPILER_VERSION_MINOR DEC(_SGI_COMPILER_VERSION/10 % 10)
-#  define COMPILER_VERSION_PATCH DEC(_SGI_COMPILER_VERSION    % 10)
-# else
-  /* _COMPILER_VERSION = VRP */
-#  define COMPILER_VERSION_MAJOR DEC(_COMPILER_VERSION/100)
-#  define COMPILER_VERSION_MINOR DEC(_COMPILER_VERSION/10 % 10)
-#  define COMPILER_VERSION_PATCH DEC(_COMPILER_VERSION    % 10)
-# endif
-
-
-/* These compilers are either not known or too old to define an
-  identification macro.  Try to identify the platform and guess that
-  it is the native compiler.  */
-#elif defined(__sgi)
-# define COMPILER_ID "MIPSpro"
-
-#elif defined(__hpux) || defined(__hpua)
-# define COMPILER_ID "HP"
-
-#else /* unknown compiler */
-# define COMPILER_ID ""
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
-#ifdef SIMULATE_ID
-char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
-#endif
-
-#ifdef __QNXNTO__
-char const* qnxnto = "INFO" ":" "qnxnto[]";
-#endif
-
-#if defined(__CRAYXE) || defined(__CRAYXC)
-char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
-#endif
-
-#define STRINGIFY_HELPER(X) #X
-#define STRINGIFY(X) STRINGIFY_HELPER(X)
-
-/* Identify known platforms by name.  */
-#if defined(__linux) || defined(__linux__) || defined(linux)
-# define PLATFORM_ID "Linux"
-
-#elif defined(__CYGWIN__)
-# define PLATFORM_ID "Cygwin"
-
-#elif defined(__MINGW32__)
-# define PLATFORM_ID "MinGW"
-
-#elif defined(__APPLE__)
-# define PLATFORM_ID "Darwin"
-
-#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
-# define PLATFORM_ID "Windows"
-
-#elif defined(__FreeBSD__) || defined(__FreeBSD)
-# define PLATFORM_ID "FreeBSD"
-
-#elif defined(__NetBSD__) || defined(__NetBSD)
-# define PLATFORM_ID "NetBSD"
-
-#elif defined(__OpenBSD__) || defined(__OPENBSD)
-# define PLATFORM_ID "OpenBSD"
-
-#elif defined(__sun) || defined(sun)
-# define PLATFORM_ID "SunOS"
-
-#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
-# define PLATFORM_ID "AIX"
-
-#elif defined(__sgi) || defined(__sgi__) || defined(_SGI)
-# define PLATFORM_ID "IRIX"
-
-#elif defined(__hpux) || defined(__hpux__)
-# define PLATFORM_ID "HP-UX"
-
-#elif defined(__HAIKU__)
-# define PLATFORM_ID "Haiku"
-
-#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
-# define PLATFORM_ID "BeOS"
-
-#elif defined(__QNX__) || defined(__QNXNTO__)
-# define PLATFORM_ID "QNX"
-
-#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
-# define PLATFORM_ID "Tru64"
-
-#elif defined(__riscos) || defined(__riscos__)
-# define PLATFORM_ID "RISCos"
-
-#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
-# define PLATFORM_ID "SINIX"
-
-#elif defined(__UNIX_SV__)
-# define PLATFORM_ID "UNIX_SV"
-
-#elif defined(__bsdos__)
-# define PLATFORM_ID "BSDOS"
-
-#elif defined(_MPRAS) || defined(MPRAS)
-# define PLATFORM_ID "MP-RAS"
-
-#elif defined(__osf) || defined(__osf__)
-# define PLATFORM_ID "OSF1"
-
-#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
-# define PLATFORM_ID "SCO_SV"
-
-#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
-# define PLATFORM_ID "ULTRIX"
-
-#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
-# define PLATFORM_ID "Xenix"
-
-#elif defined(__WATCOMC__)
-# if defined(__LINUX__)
-#  define PLATFORM_ID "Linux"
-
-# elif defined(__DOS__)
-#  define PLATFORM_ID "DOS"
-
-# elif defined(__OS2__)
-#  define PLATFORM_ID "OS2"
-
-# elif defined(__WINDOWS__)
-#  define PLATFORM_ID "Windows3x"
-
-# else /* unknown platform */
-#  define PLATFORM_ID ""
-# endif
-
-#else /* unknown platform */
-# define PLATFORM_ID ""
-
-#endif
-
-/* For windows compilers MSVC and Intel we can determine
-   the architecture of the compiler being used.  This is because
-   the compilers do not have flags that can change the architecture,
-   but rather depend on which compiler is being used
-*/
-#if defined(_WIN32) && defined(_MSC_VER)
-# if defined(_M_IA64)
-#  define ARCHITECTURE_ID "IA64"
-
-# elif defined(_M_X64) || defined(_M_AMD64)
-#  define ARCHITECTURE_ID "x64"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# elif defined(_M_ARM)
-#  if _M_ARM == 4
-#   define ARCHITECTURE_ID "ARMV4I"
-#  elif _M_ARM == 5
-#   define ARCHITECTURE_ID "ARMV5I"
-#  else
-#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
-#  endif
-
-# elif defined(_M_MIPS)
-#  define ARCHITECTURE_ID "MIPS"
-
-# elif defined(_M_SH)
-#  define ARCHITECTURE_ID "SHx"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__WATCOMC__)
-# if defined(_M_I86)
-#  define ARCHITECTURE_ID "I86"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#else
-#  define ARCHITECTURE_ID ""
-#endif
-
-/* Convert integer to decimal digit literals.  */
-#define DEC(n)                   \
-  ('0' + (((n) / 10000000)%10)), \
-  ('0' + (((n) / 1000000)%10)),  \
-  ('0' + (((n) / 100000)%10)),   \
-  ('0' + (((n) / 10000)%10)),    \
-  ('0' + (((n) / 1000)%10)),     \
-  ('0' + (((n) / 100)%10)),      \
-  ('0' + (((n) / 10)%10)),       \
-  ('0' +  ((n) % 10))
-
-/* Convert integer to hex digit literals.  */
-#define HEX(n)             \
-  ('0' + ((n)>>28 & 0xF)), \
-  ('0' + ((n)>>24 & 0xF)), \
-  ('0' + ((n)>>20 & 0xF)), \
-  ('0' + ((n)>>16 & 0xF)), \
-  ('0' + ((n)>>12 & 0xF)), \
-  ('0' + ((n)>>8  & 0xF)), \
-  ('0' + ((n)>>4  & 0xF)), \
-  ('0' + ((n)     & 0xF))
-
-/* Construct a string literal encoding the version number components. */
-#ifdef COMPILER_VERSION_MAJOR
-char const info_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
-  COMPILER_VERSION_MAJOR,
-# ifdef COMPILER_VERSION_MINOR
-  '.', COMPILER_VERSION_MINOR,
-#  ifdef COMPILER_VERSION_PATCH
-   '.', COMPILER_VERSION_PATCH,
-#   ifdef COMPILER_VERSION_TWEAK
-    '.', COMPILER_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct a string literal encoding the version number components. */
-#ifdef SIMULATE_VERSION_MAJOR
-char const info_simulate_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
-  SIMULATE_VERSION_MAJOR,
-# ifdef SIMULATE_VERSION_MINOR
-  '.', SIMULATE_VERSION_MINOR,
-#  ifdef SIMULATE_VERSION_PATCH
-   '.', SIMULATE_VERSION_PATCH,
-#   ifdef SIMULATE_VERSION_TWEAK
-    '.', SIMULATE_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
-char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
-
-
-
-
-const char* info_language_dialect_default = "INFO" ":" "dialect_default["
-#if !defined(__STDC_VERSION__)
-  "90"
-#elif __STDC_VERSION__ >= 201000L
-  "11"
-#elif __STDC_VERSION__ >= 199901L
-  "99"
-#else
-#endif
-"]";
-
-/*--------------------------------------------------------------------------*/
-
-#ifdef ID_VOID_MAIN
-void main() {}
-#else
-int main(int argc, char* argv[])
-{
-  int require = 0;
-  require += info_compiler[argc];
-  require += info_platform[argc];
-  require += info_arch[argc];
-#ifdef COMPILER_VERSION_MAJOR
-  require += info_version[argc];
-#endif
-#ifdef SIMULATE_ID
-  require += info_simulate[argc];
-#endif
-#ifdef SIMULATE_VERSION_MAJOR
-  require += info_simulate_version[argc];
-#endif
-#if defined(__CRAYXE) || defined(__CRAYXC)
-  require += info_cray[argc];
-#endif
-  require += info_language_dialect_default[argc];
-  (void)argv;
-  return require;
-}
-#endif
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/a.out b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/a.out
deleted file mode 100755
index 3e779cddc6621457f3aa2ca25bab3125c9c419b2..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/a.out and /dev/null differ
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/CMakeCXXCompilerId.cpp b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/CMakeCXXCompilerId.cpp
deleted file mode 100644
index e6d853637c6f7637dd8672b59612a9263a4d0244..0000000000000000000000000000000000000000
--- a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/CMakeCXXCompilerId.cpp
+++ /dev/null
@@ -1,533 +0,0 @@
-/* This source file must have a .cpp extension so that all C++ compilers
-   recognize the extension without flags.  Borland does not know .cxx for
-   example.  */
-#ifndef __cplusplus
-# error "A C compiler has been selected for C++."
-#endif
-
-
-/* Version number components: V=Version, R=Revision, P=Patch
-   Version date components:   YYYY=Year, MM=Month,   DD=Day  */
-
-#if defined(__COMO__)
-# define COMPILER_ID "Comeau"
-  /* __COMO_VERSION__ = VRR */
-# define COMPILER_VERSION_MAJOR DEC(__COMO_VERSION__ / 100)
-# define COMPILER_VERSION_MINOR DEC(__COMO_VERSION__ % 100)
-
-#elif defined(__INTEL_COMPILER) || defined(__ICC)
-# define COMPILER_ID "Intel"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-  /* __INTEL_COMPILER = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
-# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
-# if defined(__INTEL_COMPILER_UPDATE)
-#  define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
-# else
-#  define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER   % 10)
-# endif
-# if defined(__INTEL_COMPILER_BUILD_DATE)
-  /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
-#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
-# endif
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-
-#elif defined(__PATHCC__)
-# define COMPILER_ID "PathScale"
-# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
-# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
-# if defined(__PATHCC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
-# endif
-
-#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
-# define COMPILER_ID "Embarcadero"
-# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
-# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
-# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__     & 0xFFFF)
-
-#elif defined(__BORLANDC__)
-# define COMPILER_ID "Borland"
-  /* __BORLANDC__ = 0xVRR */
-# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
-# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
-
-#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
-# define COMPILER_ID "Watcom"
-   /* __WATCOMC__ = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__WATCOMC__)
-# define COMPILER_ID "OpenWatcom"
-   /* __WATCOMC__ = VVRP + 1100 */
-# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__SUNPRO_CC)
-# define COMPILER_ID "SunPro"
-# if __SUNPRO_CC >= 0x5100
-   /* __SUNPRO_CC = 0xVRRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
-# else
-   /* __SUNPRO_CC = 0xVRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
-# endif
-
-#elif defined(__HP_aCC)
-# define COMPILER_ID "HP"
-  /* __HP_aCC = VVRRPP */
-# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000)
-# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100)
-# define COMPILER_VERSION_PATCH DEC(__HP_aCC     % 100)
-
-#elif defined(__DECCXX)
-# define COMPILER_ID "Compaq"
-  /* __DECCXX_VER = VVRRTPPPP */
-# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000)
-# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000  % 100)
-# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER         % 10000)
-
-#elif defined(__IBMCPP__) && defined(__COMPILER_VER__)
-# define COMPILER_ID "zOS"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800
-# define COMPILER_ID "XL"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800
-# define COMPILER_ID "VisualAge"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__PGI)
-# define COMPILER_ID "PGI"
-# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
-# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
-# if defined(__PGIC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
-# endif
-
-#elif defined(_CRAYC)
-# define COMPILER_ID "Cray"
-# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
-# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
-
-#elif defined(__TI_COMPILER_VERSION__)
-# define COMPILER_ID "TI"
-  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
-# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
-# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000   % 1000)
-# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__        % 1000)
-
-#elif defined(__FUJITSU) || defined(__FCC_VERSION) || defined(__fcc_version)
-# define COMPILER_ID "Fujitsu"
-
-#elif defined(__SCO_VERSION__)
-# define COMPILER_ID "SCO"
-
-#elif defined(__clang__) && defined(__apple_build_version__)
-# define COMPILER_ID "AppleClang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
-
-#elif defined(__clang__)
-# define COMPILER_ID "Clang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-
-#elif defined(__GNUC__)
-# define COMPILER_ID "GNU"
-# define COMPILER_VERSION_MAJOR DEC(__GNUC__)
-# if defined(__GNUC_MINOR__)
-#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
-# endif
-# if defined(__GNUC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-# endif
-
-#elif defined(_MSC_VER)
-# define COMPILER_ID "MSVC"
-  /* _MSC_VER = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
-# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
-# if defined(_MSC_FULL_VER)
-#  if _MSC_VER >= 1400
-    /* _MSC_FULL_VER = VVRRPPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
-#  else
-    /* _MSC_FULL_VER = VVRRPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
-#  endif
-# endif
-# if defined(_MSC_BUILD)
-#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
-# endif
-
-#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__)
-# define COMPILER_ID "ADSP"
-#if defined(__VISUALDSPVERSION__)
-  /* __VISUALDSPVERSION__ = 0xVVRRPP00 */
-# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24)
-# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF)
-# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8  & 0xFF)
-#endif
-
-#elif defined(__IAR_SYSTEMS_ICC__ ) || defined(__IAR_SYSTEMS_ICC)
-# define COMPILER_ID "IAR"
-
-#elif defined(__ARMCC_VERSION)
-# define COMPILER_ID "ARMCC"
-#if __ARMCC_VERSION >= 1000000
-  /* __ARMCC_VERSION = VRRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION     % 10000)
-#else
-  /* __ARMCC_VERSION = VRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION    % 10000)
-#endif
-
-
-#elif defined(_SGI_COMPILER_VERSION) || defined(_COMPILER_VERSION)
-# define COMPILER_ID "MIPSpro"
-# if defined(_SGI_COMPILER_VERSION)
-  /* _SGI_COMPILER_VERSION = VRP */
-#  define COMPILER_VERSION_MAJOR DEC(_SGI_COMPILER_VERSION/100)
-#  define COMPILER_VERSION_MINOR DEC(_SGI_COMPILER_VERSION/10 % 10)
-#  define COMPILER_VERSION_PATCH DEC(_SGI_COMPILER_VERSION    % 10)
-# else
-  /* _COMPILER_VERSION = VRP */
-#  define COMPILER_VERSION_MAJOR DEC(_COMPILER_VERSION/100)
-#  define COMPILER_VERSION_MINOR DEC(_COMPILER_VERSION/10 % 10)
-#  define COMPILER_VERSION_PATCH DEC(_COMPILER_VERSION    % 10)
-# endif
-
-
-/* These compilers are either not known or too old to define an
-  identification macro.  Try to identify the platform and guess that
-  it is the native compiler.  */
-#elif defined(__sgi)
-# define COMPILER_ID "MIPSpro"
-
-#elif defined(__hpux) || defined(__hpua)
-# define COMPILER_ID "HP"
-
-#else /* unknown compiler */
-# define COMPILER_ID ""
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
-#ifdef SIMULATE_ID
-char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
-#endif
-
-#ifdef __QNXNTO__
-char const* qnxnto = "INFO" ":" "qnxnto[]";
-#endif
-
-#if defined(__CRAYXE) || defined(__CRAYXC)
-char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
-#endif
-
-#define STRINGIFY_HELPER(X) #X
-#define STRINGIFY(X) STRINGIFY_HELPER(X)
-
-/* Identify known platforms by name.  */
-#if defined(__linux) || defined(__linux__) || defined(linux)
-# define PLATFORM_ID "Linux"
-
-#elif defined(__CYGWIN__)
-# define PLATFORM_ID "Cygwin"
-
-#elif defined(__MINGW32__)
-# define PLATFORM_ID "MinGW"
-
-#elif defined(__APPLE__)
-# define PLATFORM_ID "Darwin"
-
-#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
-# define PLATFORM_ID "Windows"
-
-#elif defined(__FreeBSD__) || defined(__FreeBSD)
-# define PLATFORM_ID "FreeBSD"
-
-#elif defined(__NetBSD__) || defined(__NetBSD)
-# define PLATFORM_ID "NetBSD"
-
-#elif defined(__OpenBSD__) || defined(__OPENBSD)
-# define PLATFORM_ID "OpenBSD"
-
-#elif defined(__sun) || defined(sun)
-# define PLATFORM_ID "SunOS"
-
-#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
-# define PLATFORM_ID "AIX"
-
-#elif defined(__sgi) || defined(__sgi__) || defined(_SGI)
-# define PLATFORM_ID "IRIX"
-
-#elif defined(__hpux) || defined(__hpux__)
-# define PLATFORM_ID "HP-UX"
-
-#elif defined(__HAIKU__)
-# define PLATFORM_ID "Haiku"
-
-#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
-# define PLATFORM_ID "BeOS"
-
-#elif defined(__QNX__) || defined(__QNXNTO__)
-# define PLATFORM_ID "QNX"
-
-#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
-# define PLATFORM_ID "Tru64"
-
-#elif defined(__riscos) || defined(__riscos__)
-# define PLATFORM_ID "RISCos"
-
-#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
-# define PLATFORM_ID "SINIX"
-
-#elif defined(__UNIX_SV__)
-# define PLATFORM_ID "UNIX_SV"
-
-#elif defined(__bsdos__)
-# define PLATFORM_ID "BSDOS"
-
-#elif defined(_MPRAS) || defined(MPRAS)
-# define PLATFORM_ID "MP-RAS"
-
-#elif defined(__osf) || defined(__osf__)
-# define PLATFORM_ID "OSF1"
-
-#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
-# define PLATFORM_ID "SCO_SV"
-
-#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
-# define PLATFORM_ID "ULTRIX"
-
-#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
-# define PLATFORM_ID "Xenix"
-
-#elif defined(__WATCOMC__)
-# if defined(__LINUX__)
-#  define PLATFORM_ID "Linux"
-
-# elif defined(__DOS__)
-#  define PLATFORM_ID "DOS"
-
-# elif defined(__OS2__)
-#  define PLATFORM_ID "OS2"
-
-# elif defined(__WINDOWS__)
-#  define PLATFORM_ID "Windows3x"
-
-# else /* unknown platform */
-#  define PLATFORM_ID ""
-# endif
-
-#else /* unknown platform */
-# define PLATFORM_ID ""
-
-#endif
-
-/* For windows compilers MSVC and Intel we can determine
-   the architecture of the compiler being used.  This is because
-   the compilers do not have flags that can change the architecture,
-   but rather depend on which compiler is being used
-*/
-#if defined(_WIN32) && defined(_MSC_VER)
-# if defined(_M_IA64)
-#  define ARCHITECTURE_ID "IA64"
-
-# elif defined(_M_X64) || defined(_M_AMD64)
-#  define ARCHITECTURE_ID "x64"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# elif defined(_M_ARM)
-#  if _M_ARM == 4
-#   define ARCHITECTURE_ID "ARMV4I"
-#  elif _M_ARM == 5
-#   define ARCHITECTURE_ID "ARMV5I"
-#  else
-#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
-#  endif
-
-# elif defined(_M_MIPS)
-#  define ARCHITECTURE_ID "MIPS"
-
-# elif defined(_M_SH)
-#  define ARCHITECTURE_ID "SHx"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__WATCOMC__)
-# if defined(_M_I86)
-#  define ARCHITECTURE_ID "I86"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#else
-#  define ARCHITECTURE_ID ""
-#endif
-
-/* Convert integer to decimal digit literals.  */
-#define DEC(n)                   \
-  ('0' + (((n) / 10000000)%10)), \
-  ('0' + (((n) / 1000000)%10)),  \
-  ('0' + (((n) / 100000)%10)),   \
-  ('0' + (((n) / 10000)%10)),    \
-  ('0' + (((n) / 1000)%10)),     \
-  ('0' + (((n) / 100)%10)),      \
-  ('0' + (((n) / 10)%10)),       \
-  ('0' +  ((n) % 10))
-
-/* Convert integer to hex digit literals.  */
-#define HEX(n)             \
-  ('0' + ((n)>>28 & 0xF)), \
-  ('0' + ((n)>>24 & 0xF)), \
-  ('0' + ((n)>>20 & 0xF)), \
-  ('0' + ((n)>>16 & 0xF)), \
-  ('0' + ((n)>>12 & 0xF)), \
-  ('0' + ((n)>>8  & 0xF)), \
-  ('0' + ((n)>>4  & 0xF)), \
-  ('0' + ((n)     & 0xF))
-
-/* Construct a string literal encoding the version number components. */
-#ifdef COMPILER_VERSION_MAJOR
-char const info_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
-  COMPILER_VERSION_MAJOR,
-# ifdef COMPILER_VERSION_MINOR
-  '.', COMPILER_VERSION_MINOR,
-#  ifdef COMPILER_VERSION_PATCH
-   '.', COMPILER_VERSION_PATCH,
-#   ifdef COMPILER_VERSION_TWEAK
-    '.', COMPILER_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct a string literal encoding the version number components. */
-#ifdef SIMULATE_VERSION_MAJOR
-char const info_simulate_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
-  SIMULATE_VERSION_MAJOR,
-# ifdef SIMULATE_VERSION_MINOR
-  '.', SIMULATE_VERSION_MINOR,
-#  ifdef SIMULATE_VERSION_PATCH
-   '.', SIMULATE_VERSION_PATCH,
-#   ifdef SIMULATE_VERSION_TWEAK
-    '.', SIMULATE_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
-char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
-
-
-
-
-const char* info_language_dialect_default = "INFO" ":" "dialect_default["
-#if __cplusplus >= 201402L
-  "14"
-#elif __cplusplus >= 201103L
-  "11"
-#else
-  "98"
-#endif
-"]";
-
-/*--------------------------------------------------------------------------*/
-
-int main(int argc, char* argv[])
-{
-  int require = 0;
-  require += info_compiler[argc];
-  require += info_platform[argc];
-#ifdef COMPILER_VERSION_MAJOR
-  require += info_version[argc];
-#endif
-#ifdef SIMULATE_ID
-  require += info_simulate[argc];
-#endif
-#ifdef SIMULATE_VERSION_MAJOR
-  require += info_simulate_version[argc];
-#endif
-#if defined(__CRAYXE) || defined(__CRAYXC)
-  require += info_cray[argc];
-#endif
-  require += info_language_dialect_default[argc];
-  (void)argv;
-  return require;
-}
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/a.out b/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/a.out
deleted file mode 100755
index 8f3f2abf609f4888e5f8ff9281af2657a5e2094d..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/a.out and /dev/null differ
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/CMakeOutput.log b/hpvm/projects/llvm-cbe/build/CMakeFiles/CMakeOutput.log
deleted file mode 100644
index 34bcb9577d9dad1ca8cb6c1f22529e1cb191cbe6..0000000000000000000000000000000000000000
--- a/hpvm/projects/llvm-cbe/build/CMakeFiles/CMakeOutput.log
+++ /dev/null
@@ -1,554 +0,0 @@
-The system is: Linux - 4.15.0-66-generic - x86_64
-Compiling the C compiler identification source file "CMakeCCompilerId.c" succeeded.
-Compiler: /usr/bin/cc 
-Build flags: 
-Id flags: 
-
-The output was:
-0
-
-
-Compilation of the C compiler identification source "CMakeCCompilerId.c" produced "a.out"
-
-The C compiler identification is GNU, found in "/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdC/a.out"
-
-Compiling the CXX compiler identification source file "CMakeCXXCompilerId.cpp" succeeded.
-Compiler: /usr/bin/c++ 
-Build flags: 
-Id flags: 
-
-The output was:
-0
-
-
-Compilation of the CXX compiler identification source "CMakeCXXCompilerId.cpp" produced "a.out"
-
-The CXX compiler identification is GNU, found in "/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/3.5.1/CompilerIdCXX/a.out"
-
-Determining if the C compiler works passed with the following output:
-Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp
-
-Run Build Command:"/usr/bin/make" "cmTC_a1887/fast"
-/usr/bin/make -f CMakeFiles/cmTC_a1887.dir/build.make CMakeFiles/cmTC_a1887.dir/build
-make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-Building C object CMakeFiles/cmTC_a1887.dir/testCCompiler.c.o
-/usr/bin/cc     -o CMakeFiles/cmTC_a1887.dir/testCCompiler.c.o   -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp/testCCompiler.c
-Linking C executable cmTC_a1887
-/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_a1887.dir/link.txt --verbose=1
-/usr/bin/cc       CMakeFiles/cmTC_a1887.dir/testCCompiler.c.o  -o cmTC_a1887 -rdynamic 
-make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-
-
-Detecting C compiler ABI info compiled with the following output:
-Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp
-
-Run Build Command:"/usr/bin/make" "cmTC_8b6ac/fast"
-/usr/bin/make -f CMakeFiles/cmTC_8b6ac.dir/build.make CMakeFiles/cmTC_8b6ac.dir/build
-make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-Building C object CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o
-/usr/bin/cc     -o CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o   -c /usr/share/cmake-3.5/Modules/CMakeCCompilerABI.c
-Linking C executable cmTC_8b6ac
-/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_8b6ac.dir/link.txt --verbose=1
-/usr/bin/cc      -v CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o  -o cmTC_8b6ac -rdynamic  
-Using built-in specs.
-COLLECT_GCC=/usr/bin/cc
-COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper
-Target: x86_64-linux-gnu
-Configured with: ../src/configure -v --with-pkgversion='Ubuntu 5.4.0-6ubuntu1~16.04.11' --with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-5 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
-Thread model: posix
-gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.11) 
-COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/
-LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../:/lib/:/usr/lib/
-COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_8b6ac' '-rdynamic' '-mtune=generic' '-march=x86-64'
- /usr/lib/gcc/x86_64-linux-gnu/5/collect2 -plugin /usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper -plugin-opt=-fresolution=/tmp/cckmhPLv.res -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s --sysroot=/ --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -export-dynamic -dynamic-linker /lib64/ld-linux-x86-64.so.2 -z relro -o cmTC_8b6ac /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/5 -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/5/../../.. CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o -lgcc --as-needed -lgcc_s --no-as-needed -lc -lgcc --as-needed -lgcc_s --no-as-needed /usr/lib/gcc/x86_64-linux-gnu/5/crtend.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o
-make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-
-
-Parsed C implicit link information from above output:
-  link line regex: [^( *|.*[/\])(ld|([^/\]+-)?ld|collect2)[^/\]*( |$)]
-  ignore line: [Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp]
-  ignore line: []
-  ignore line: [Run Build Command:"/usr/bin/make" "cmTC_8b6ac/fast"]
-  ignore line: [/usr/bin/make -f CMakeFiles/cmTC_8b6ac.dir/build.make CMakeFiles/cmTC_8b6ac.dir/build]
-  ignore line: [make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp']
-  ignore line: [Building C object CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o]
-  ignore line: [/usr/bin/cc     -o CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o   -c /usr/share/cmake-3.5/Modules/CMakeCCompilerABI.c]
-  ignore line: [Linking C executable cmTC_8b6ac]
-  ignore line: [/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_8b6ac.dir/link.txt --verbose=1]
-  ignore line: [/usr/bin/cc      -v CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o  -o cmTC_8b6ac -rdynamic  ]
-  ignore line: [Using built-in specs.]
-  ignore line: [COLLECT_GCC=/usr/bin/cc]
-  ignore line: [COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper]
-  ignore line: [Target: x86_64-linux-gnu]
-  ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 5.4.0-6ubuntu1~16.04.11' --with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-5 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu]
-  ignore line: [Thread model: posix]
-  ignore line: [gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.11) ]
-  ignore line: [COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/]
-  ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../:/lib/:/usr/lib/]
-  ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_8b6ac' '-rdynamic' '-mtune=generic' '-march=x86-64']
-  link line: [ /usr/lib/gcc/x86_64-linux-gnu/5/collect2 -plugin /usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper -plugin-opt=-fresolution=/tmp/cckmhPLv.res -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lgcc_s --sysroot=/ --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -export-dynamic -dynamic-linker /lib64/ld-linux-x86-64.so.2 -z relro -o cmTC_8b6ac /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/5 -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/5/../../.. CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o -lgcc --as-needed -lgcc_s --no-as-needed -lc -lgcc --as-needed -lgcc_s --no-as-needed /usr/lib/gcc/x86_64-linux-gnu/5/crtend.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o]
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/collect2] ==> ignore
-    arg [-plugin] ==> ignore
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so] ==> ignore
-    arg [-plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper] ==> ignore
-    arg [-plugin-opt=-fresolution=/tmp/cckmhPLv.res] ==> ignore
-    arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
-    arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
-    arg [-plugin-opt=-pass-through=-lc] ==> ignore
-    arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
-    arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
-    arg [--sysroot=/] ==> ignore
-    arg [--build-id] ==> ignore
-    arg [--eh-frame-hdr] ==> ignore
-    arg [-m] ==> ignore
-    arg [elf_x86_64] ==> ignore
-    arg [--hash-style=gnu] ==> ignore
-    arg [--as-needed] ==> ignore
-    arg [-export-dynamic] ==> ignore
-    arg [-dynamic-linker] ==> ignore
-    arg [/lib64/ld-linux-x86-64.so.2] ==> ignore
-    arg [-zrelro] ==> ignore
-    arg [-o] ==> ignore
-    arg [cmTC_8b6ac] ==> ignore
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o] ==> ignore
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o] ==> ignore
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o] ==> ignore
-    arg [-L/usr/lib/gcc/x86_64-linux-gnu/5] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5]
-    arg [-L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu]
-    arg [-L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib]
-    arg [-L/lib/x86_64-linux-gnu] ==> dir [/lib/x86_64-linux-gnu]
-    arg [-L/lib/../lib] ==> dir [/lib/../lib]
-    arg [-L/usr/lib/x86_64-linux-gnu] ==> dir [/usr/lib/x86_64-linux-gnu]
-    arg [-L/usr/lib/../lib] ==> dir [/usr/lib/../lib]
-    arg [-L/usr/lib/gcc/x86_64-linux-gnu/5/../../..] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../..]
-    arg [CMakeFiles/cmTC_8b6ac.dir/CMakeCCompilerABI.c.o] ==> ignore
-    arg [-lgcc] ==> lib [gcc]
-    arg [--as-needed] ==> ignore
-    arg [-lgcc_s] ==> lib [gcc_s]
-    arg [--no-as-needed] ==> ignore
-    arg [-lc] ==> lib [c]
-    arg [-lgcc] ==> lib [gcc]
-    arg [--as-needed] ==> ignore
-    arg [-lgcc_s] ==> lib [gcc_s]
-    arg [--no-as-needed] ==> ignore
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/crtend.o] ==> ignore
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o] ==> ignore
-  remove lib [gcc]
-  remove lib [gcc_s]
-  remove lib [gcc]
-  remove lib [gcc_s]
-  collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5] ==> [/usr/lib/gcc/x86_64-linux-gnu/5]
-  collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
-  collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib] ==> [/usr/lib]
-  collapse library dir [/lib/x86_64-linux-gnu] ==> [/lib/x86_64-linux-gnu]
-  collapse library dir [/lib/../lib] ==> [/lib]
-  collapse library dir [/usr/lib/x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
-  collapse library dir [/usr/lib/../lib] ==> [/usr/lib]
-  collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../..] ==> [/usr/lib]
-  implicit libs: [c]
-  implicit dirs: [/usr/lib/gcc/x86_64-linux-gnu/5;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib]
-  implicit fwks: []
-
-
-
-
-Detecting C [-std=c11] compiler features compiled with the following output:
-Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp
-
-Run Build Command:"/usr/bin/make" "cmTC_58d37/fast"
-/usr/bin/make -f CMakeFiles/cmTC_58d37.dir/build.make CMakeFiles/cmTC_58d37.dir/build
-make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-Building C object CMakeFiles/cmTC_58d37.dir/feature_tests.c.o
-/usr/bin/cc    -std=c11 -o CMakeFiles/cmTC_58d37.dir/feature_tests.c.o   -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c
-Linking C executable cmTC_58d37
-/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_58d37.dir/link.txt --verbose=1
-/usr/bin/cc       CMakeFiles/cmTC_58d37.dir/feature_tests.c.o  -o cmTC_58d37 -rdynamic 
-make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-
-
-    Feature record: C_FEATURE:1c_function_prototypes
-    Feature record: C_FEATURE:1c_restrict
-    Feature record: C_FEATURE:1c_static_assert
-    Feature record: C_FEATURE:1c_variadic_macros
-
-
-Detecting C [-std=c99] compiler features compiled with the following output:
-Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp
-
-Run Build Command:"/usr/bin/make" "cmTC_47bc6/fast"
-/usr/bin/make -f CMakeFiles/cmTC_47bc6.dir/build.make CMakeFiles/cmTC_47bc6.dir/build
-make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-Building C object CMakeFiles/cmTC_47bc6.dir/feature_tests.c.o
-/usr/bin/cc    -std=c99 -o CMakeFiles/cmTC_47bc6.dir/feature_tests.c.o   -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c
-Linking C executable cmTC_47bc6
-/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_47bc6.dir/link.txt --verbose=1
-/usr/bin/cc       CMakeFiles/cmTC_47bc6.dir/feature_tests.c.o  -o cmTC_47bc6 -rdynamic 
-make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-
-
-    Feature record: C_FEATURE:1c_function_prototypes
-    Feature record: C_FEATURE:1c_restrict
-    Feature record: C_FEATURE:0c_static_assert
-    Feature record: C_FEATURE:1c_variadic_macros
-
-
-Detecting C [-std=c90] compiler features compiled with the following output:
-Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp
-
-Run Build Command:"/usr/bin/make" "cmTC_bafe8/fast"
-/usr/bin/make -f CMakeFiles/cmTC_bafe8.dir/build.make CMakeFiles/cmTC_bafe8.dir/build
-make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-Building C object CMakeFiles/cmTC_bafe8.dir/feature_tests.c.o
-/usr/bin/cc    -std=c90 -o CMakeFiles/cmTC_bafe8.dir/feature_tests.c.o   -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c
-Linking C executable cmTC_bafe8
-/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_bafe8.dir/link.txt --verbose=1
-/usr/bin/cc       CMakeFiles/cmTC_bafe8.dir/feature_tests.c.o  -o cmTC_bafe8 -rdynamic 
-make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-
-
-    Feature record: C_FEATURE:1c_function_prototypes
-    Feature record: C_FEATURE:0c_restrict
-    Feature record: C_FEATURE:0c_static_assert
-    Feature record: C_FEATURE:0c_variadic_macros
-Determining if the CXX compiler works passed with the following output:
-Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp
-
-Run Build Command:"/usr/bin/make" "cmTC_a4db2/fast"
-/usr/bin/make -f CMakeFiles/cmTC_a4db2.dir/build.make CMakeFiles/cmTC_a4db2.dir/build
-make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-Building CXX object CMakeFiles/cmTC_a4db2.dir/testCXXCompiler.cxx.o
-/usr/bin/c++      -o CMakeFiles/cmTC_a4db2.dir/testCXXCompiler.cxx.o -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp/testCXXCompiler.cxx
-Linking CXX executable cmTC_a4db2
-/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_a4db2.dir/link.txt --verbose=1
-/usr/bin/c++        CMakeFiles/cmTC_a4db2.dir/testCXXCompiler.cxx.o  -o cmTC_a4db2 -rdynamic 
-make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-
-
-Detecting CXX compiler ABI info compiled with the following output:
-Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp
-
-Run Build Command:"/usr/bin/make" "cmTC_9fb11/fast"
-/usr/bin/make -f CMakeFiles/cmTC_9fb11.dir/build.make CMakeFiles/cmTC_9fb11.dir/build
-make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-Building CXX object CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o
-/usr/bin/c++      -o CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o -c /usr/share/cmake-3.5/Modules/CMakeCXXCompilerABI.cpp
-Linking CXX executable cmTC_9fb11
-/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_9fb11.dir/link.txt --verbose=1
-/usr/bin/c++       -v CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o  -o cmTC_9fb11 -rdynamic  
-Using built-in specs.
-COLLECT_GCC=/usr/bin/c++
-COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper
-Target: x86_64-linux-gnu
-Configured with: ../src/configure -v --with-pkgversion='Ubuntu 5.4.0-6ubuntu1~16.04.11' --with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-5 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
-Thread model: posix
-gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.11) 
-COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/
-LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../:/lib/:/usr/lib/
-COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_9fb11' '-rdynamic' '-shared-libgcc' '-mtune=generic' '-march=x86-64'
- /usr/lib/gcc/x86_64-linux-gnu/5/collect2 -plugin /usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper -plugin-opt=-fresolution=/tmp/ccHuuX3E.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --sysroot=/ --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -export-dynamic -dynamic-linker /lib64/ld-linux-x86-64.so.2 -z relro -o cmTC_9fb11 /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/5 -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/5/../../.. CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/5/crtend.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o
-make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-
-
-Parsed CXX implicit link information from above output:
-  link line regex: [^( *|.*[/\])(ld|([^/\]+-)?ld|collect2)[^/\]*( |$)]
-  ignore line: [Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp]
-  ignore line: []
-  ignore line: [Run Build Command:"/usr/bin/make" "cmTC_9fb11/fast"]
-  ignore line: [/usr/bin/make -f CMakeFiles/cmTC_9fb11.dir/build.make CMakeFiles/cmTC_9fb11.dir/build]
-  ignore line: [make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp']
-  ignore line: [Building CXX object CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o]
-  ignore line: [/usr/bin/c++      -o CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o -c /usr/share/cmake-3.5/Modules/CMakeCXXCompilerABI.cpp]
-  ignore line: [Linking CXX executable cmTC_9fb11]
-  ignore line: [/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_9fb11.dir/link.txt --verbose=1]
-  ignore line: [/usr/bin/c++       -v CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o  -o cmTC_9fb11 -rdynamic  ]
-  ignore line: [Using built-in specs.]
-  ignore line: [COLLECT_GCC=/usr/bin/c++]
-  ignore line: [COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper]
-  ignore line: [Target: x86_64-linux-gnu]
-  ignore line: [Configured with: ../src/configure -v --with-pkgversion='Ubuntu 5.4.0-6ubuntu1~16.04.11' --with-bugurl=file:///usr/share/doc/gcc-5/README.Bugs --enable-languages=c,ada,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-5 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --with-default-libstdcxx-abi=new --enable-gnu-unique-object --disable-vtable-verify --enable-libmpx --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-5-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-5-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-5-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --enable-multilib --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu]
-  ignore line: [Thread model: posix]
-  ignore line: [gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.11) ]
-  ignore line: [COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/]
-  ignore line: [LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/5/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/5/../../../:/lib/:/usr/lib/]
-  ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_9fb11' '-rdynamic' '-shared-libgcc' '-mtune=generic' '-march=x86-64']
-  link line: [ /usr/lib/gcc/x86_64-linux-gnu/5/collect2 -plugin /usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so -plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper -plugin-opt=-fresolution=/tmp/ccHuuX3E.res -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc -plugin-opt=-pass-through=-lc -plugin-opt=-pass-through=-lgcc_s -plugin-opt=-pass-through=-lgcc --sysroot=/ --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -export-dynamic -dynamic-linker /lib64/ld-linux-x86-64.so.2 -z relro -o cmTC_9fb11 /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/5 -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/5/../../.. CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/5/crtend.o /usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o]
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/collect2] ==> ignore
-    arg [-plugin] ==> ignore
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/liblto_plugin.so] ==> ignore
-    arg [-plugin-opt=/usr/lib/gcc/x86_64-linux-gnu/5/lto-wrapper] ==> ignore
-    arg [-plugin-opt=-fresolution=/tmp/ccHuuX3E.res] ==> ignore
-    arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
-    arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
-    arg [-plugin-opt=-pass-through=-lc] ==> ignore
-    arg [-plugin-opt=-pass-through=-lgcc_s] ==> ignore
-    arg [-plugin-opt=-pass-through=-lgcc] ==> ignore
-    arg [--sysroot=/] ==> ignore
-    arg [--build-id] ==> ignore
-    arg [--eh-frame-hdr] ==> ignore
-    arg [-m] ==> ignore
-    arg [elf_x86_64] ==> ignore
-    arg [--hash-style=gnu] ==> ignore
-    arg [--as-needed] ==> ignore
-    arg [-export-dynamic] ==> ignore
-    arg [-dynamic-linker] ==> ignore
-    arg [/lib64/ld-linux-x86-64.so.2] ==> ignore
-    arg [-zrelro] ==> ignore
-    arg [-o] ==> ignore
-    arg [cmTC_9fb11] ==> ignore
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crt1.o] ==> ignore
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crti.o] ==> ignore
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/crtbegin.o] ==> ignore
-    arg [-L/usr/lib/gcc/x86_64-linux-gnu/5] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5]
-    arg [-L/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu]
-    arg [-L/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib]
-    arg [-L/lib/x86_64-linux-gnu] ==> dir [/lib/x86_64-linux-gnu]
-    arg [-L/lib/../lib] ==> dir [/lib/../lib]
-    arg [-L/usr/lib/x86_64-linux-gnu] ==> dir [/usr/lib/x86_64-linux-gnu]
-    arg [-L/usr/lib/../lib] ==> dir [/usr/lib/../lib]
-    arg [-L/usr/lib/gcc/x86_64-linux-gnu/5/../../..] ==> dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../..]
-    arg [CMakeFiles/cmTC_9fb11.dir/CMakeCXXCompilerABI.cpp.o] ==> ignore
-    arg [-lstdc++] ==> lib [stdc++]
-    arg [-lm] ==> lib [m]
-    arg [-lgcc_s] ==> lib [gcc_s]
-    arg [-lgcc] ==> lib [gcc]
-    arg [-lc] ==> lib [c]
-    arg [-lgcc_s] ==> lib [gcc_s]
-    arg [-lgcc] ==> lib [gcc]
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/crtend.o] ==> ignore
-    arg [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu/crtn.o] ==> ignore
-  remove lib [gcc_s]
-  remove lib [gcc]
-  remove lib [gcc_s]
-  remove lib [gcc]
-  collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5] ==> [/usr/lib/gcc/x86_64-linux-gnu/5]
-  collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
-  collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../../../lib] ==> [/usr/lib]
-  collapse library dir [/lib/x86_64-linux-gnu] ==> [/lib/x86_64-linux-gnu]
-  collapse library dir [/lib/../lib] ==> [/lib]
-  collapse library dir [/usr/lib/x86_64-linux-gnu] ==> [/usr/lib/x86_64-linux-gnu]
-  collapse library dir [/usr/lib/../lib] ==> [/usr/lib]
-  collapse library dir [/usr/lib/gcc/x86_64-linux-gnu/5/../../..] ==> [/usr/lib]
-  implicit libs: [stdc++;m;c]
-  implicit dirs: [/usr/lib/gcc/x86_64-linux-gnu/5;/usr/lib/x86_64-linux-gnu;/usr/lib;/lib/x86_64-linux-gnu;/lib]
-  implicit fwks: []
-
-
-
-
-Detecting CXX [-std=c++14] compiler features compiled with the following output:
-Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp
-
-Run Build Command:"/usr/bin/make" "cmTC_72948/fast"
-/usr/bin/make -f CMakeFiles/cmTC_72948.dir/build.make CMakeFiles/cmTC_72948.dir/build
-make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-Building CXX object CMakeFiles/cmTC_72948.dir/feature_tests.cxx.o
-/usr/bin/c++     -std=c++14 -o CMakeFiles/cmTC_72948.dir/feature_tests.cxx.o -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx
-Linking CXX executable cmTC_72948
-/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_72948.dir/link.txt --verbose=1
-/usr/bin/c++        CMakeFiles/cmTC_72948.dir/feature_tests.cxx.o  -o cmTC_72948 -rdynamic 
-make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-
-
-    Feature record: CXX_FEATURE:1cxx_aggregate_default_initializers
-    Feature record: CXX_FEATURE:1cxx_alias_templates
-    Feature record: CXX_FEATURE:1cxx_alignas
-    Feature record: CXX_FEATURE:1cxx_alignof
-    Feature record: CXX_FEATURE:1cxx_attributes
-    Feature record: CXX_FEATURE:1cxx_attribute_deprecated
-    Feature record: CXX_FEATURE:1cxx_auto_type
-    Feature record: CXX_FEATURE:1cxx_binary_literals
-    Feature record: CXX_FEATURE:1cxx_constexpr
-    Feature record: CXX_FEATURE:1cxx_contextual_conversions
-    Feature record: CXX_FEATURE:1cxx_decltype
-    Feature record: CXX_FEATURE:1cxx_decltype_auto
-    Feature record: CXX_FEATURE:1cxx_decltype_incomplete_return_types
-    Feature record: CXX_FEATURE:1cxx_default_function_template_args
-    Feature record: CXX_FEATURE:1cxx_defaulted_functions
-    Feature record: CXX_FEATURE:1cxx_defaulted_move_initializers
-    Feature record: CXX_FEATURE:1cxx_delegating_constructors
-    Feature record: CXX_FEATURE:1cxx_deleted_functions
-    Feature record: CXX_FEATURE:1cxx_digit_separators
-    Feature record: CXX_FEATURE:1cxx_enum_forward_declarations
-    Feature record: CXX_FEATURE:1cxx_explicit_conversions
-    Feature record: CXX_FEATURE:1cxx_extended_friend_declarations
-    Feature record: CXX_FEATURE:1cxx_extern_templates
-    Feature record: CXX_FEATURE:1cxx_final
-    Feature record: CXX_FEATURE:1cxx_func_identifier
-    Feature record: CXX_FEATURE:1cxx_generalized_initializers
-    Feature record: CXX_FEATURE:1cxx_generic_lambdas
-    Feature record: CXX_FEATURE:1cxx_inheriting_constructors
-    Feature record: CXX_FEATURE:1cxx_inline_namespaces
-    Feature record: CXX_FEATURE:1cxx_lambdas
-    Feature record: CXX_FEATURE:1cxx_lambda_init_captures
-    Feature record: CXX_FEATURE:1cxx_local_type_template_args
-    Feature record: CXX_FEATURE:1cxx_long_long_type
-    Feature record: CXX_FEATURE:1cxx_noexcept
-    Feature record: CXX_FEATURE:1cxx_nonstatic_member_init
-    Feature record: CXX_FEATURE:1cxx_nullptr
-    Feature record: CXX_FEATURE:1cxx_override
-    Feature record: CXX_FEATURE:1cxx_range_for
-    Feature record: CXX_FEATURE:1cxx_raw_string_literals
-    Feature record: CXX_FEATURE:1cxx_reference_qualified_functions
-    Feature record: CXX_FEATURE:1cxx_relaxed_constexpr
-    Feature record: CXX_FEATURE:1cxx_return_type_deduction
-    Feature record: CXX_FEATURE:1cxx_right_angle_brackets
-    Feature record: CXX_FEATURE:1cxx_rvalue_references
-    Feature record: CXX_FEATURE:1cxx_sizeof_member
-    Feature record: CXX_FEATURE:1cxx_static_assert
-    Feature record: CXX_FEATURE:1cxx_strong_enums
-    Feature record: CXX_FEATURE:1cxx_template_template_parameters
-    Feature record: CXX_FEATURE:1cxx_thread_local
-    Feature record: CXX_FEATURE:1cxx_trailing_return_types
-    Feature record: CXX_FEATURE:1cxx_unicode_literals
-    Feature record: CXX_FEATURE:1cxx_uniform_initialization
-    Feature record: CXX_FEATURE:1cxx_unrestricted_unions
-    Feature record: CXX_FEATURE:1cxx_user_literals
-    Feature record: CXX_FEATURE:1cxx_variable_templates
-    Feature record: CXX_FEATURE:1cxx_variadic_macros
-    Feature record: CXX_FEATURE:1cxx_variadic_templates
-
-
-Detecting CXX [-std=c++11] compiler features compiled with the following output:
-Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp
-
-Run Build Command:"/usr/bin/make" "cmTC_4b9af/fast"
-/usr/bin/make -f CMakeFiles/cmTC_4b9af.dir/build.make CMakeFiles/cmTC_4b9af.dir/build
-make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-Building CXX object CMakeFiles/cmTC_4b9af.dir/feature_tests.cxx.o
-/usr/bin/c++     -std=c++11 -o CMakeFiles/cmTC_4b9af.dir/feature_tests.cxx.o -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx
-Linking CXX executable cmTC_4b9af
-/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_4b9af.dir/link.txt --verbose=1
-/usr/bin/c++        CMakeFiles/cmTC_4b9af.dir/feature_tests.cxx.o  -o cmTC_4b9af -rdynamic 
-make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-
-
-    Feature record: CXX_FEATURE:0cxx_aggregate_default_initializers
-    Feature record: CXX_FEATURE:1cxx_alias_templates
-    Feature record: CXX_FEATURE:1cxx_alignas
-    Feature record: CXX_FEATURE:1cxx_alignof
-    Feature record: CXX_FEATURE:1cxx_attributes
-    Feature record: CXX_FEATURE:0cxx_attribute_deprecated
-    Feature record: CXX_FEATURE:1cxx_auto_type
-    Feature record: CXX_FEATURE:0cxx_binary_literals
-    Feature record: CXX_FEATURE:1cxx_constexpr
-    Feature record: CXX_FEATURE:0cxx_contextual_conversions
-    Feature record: CXX_FEATURE:1cxx_decltype
-    Feature record: CXX_FEATURE:0cxx_decltype_auto
-    Feature record: CXX_FEATURE:1cxx_decltype_incomplete_return_types
-    Feature record: CXX_FEATURE:1cxx_default_function_template_args
-    Feature record: CXX_FEATURE:1cxx_defaulted_functions
-    Feature record: CXX_FEATURE:1cxx_defaulted_move_initializers
-    Feature record: CXX_FEATURE:1cxx_delegating_constructors
-    Feature record: CXX_FEATURE:1cxx_deleted_functions
-    Feature record: CXX_FEATURE:0cxx_digit_separators
-    Feature record: CXX_FEATURE:1cxx_enum_forward_declarations
-    Feature record: CXX_FEATURE:1cxx_explicit_conversions
-    Feature record: CXX_FEATURE:1cxx_extended_friend_declarations
-    Feature record: CXX_FEATURE:1cxx_extern_templates
-    Feature record: CXX_FEATURE:1cxx_final
-    Feature record: CXX_FEATURE:1cxx_func_identifier
-    Feature record: CXX_FEATURE:1cxx_generalized_initializers
-    Feature record: CXX_FEATURE:0cxx_generic_lambdas
-    Feature record: CXX_FEATURE:1cxx_inheriting_constructors
-    Feature record: CXX_FEATURE:1cxx_inline_namespaces
-    Feature record: CXX_FEATURE:1cxx_lambdas
-    Feature record: CXX_FEATURE:0cxx_lambda_init_captures
-    Feature record: CXX_FEATURE:1cxx_local_type_template_args
-    Feature record: CXX_FEATURE:1cxx_long_long_type
-    Feature record: CXX_FEATURE:1cxx_noexcept
-    Feature record: CXX_FEATURE:1cxx_nonstatic_member_init
-    Feature record: CXX_FEATURE:1cxx_nullptr
-    Feature record: CXX_FEATURE:1cxx_override
-    Feature record: CXX_FEATURE:1cxx_range_for
-    Feature record: CXX_FEATURE:1cxx_raw_string_literals
-    Feature record: CXX_FEATURE:1cxx_reference_qualified_functions
-    Feature record: CXX_FEATURE:0cxx_relaxed_constexpr
-    Feature record: CXX_FEATURE:0cxx_return_type_deduction
-    Feature record: CXX_FEATURE:1cxx_right_angle_brackets
-    Feature record: CXX_FEATURE:1cxx_rvalue_references
-    Feature record: CXX_FEATURE:1cxx_sizeof_member
-    Feature record: CXX_FEATURE:1cxx_static_assert
-    Feature record: CXX_FEATURE:1cxx_strong_enums
-    Feature record: CXX_FEATURE:1cxx_template_template_parameters
-    Feature record: CXX_FEATURE:1cxx_thread_local
-    Feature record: CXX_FEATURE:1cxx_trailing_return_types
-    Feature record: CXX_FEATURE:1cxx_unicode_literals
-    Feature record: CXX_FEATURE:1cxx_uniform_initialization
-    Feature record: CXX_FEATURE:1cxx_unrestricted_unions
-    Feature record: CXX_FEATURE:1cxx_user_literals
-    Feature record: CXX_FEATURE:0cxx_variable_templates
-    Feature record: CXX_FEATURE:1cxx_variadic_macros
-    Feature record: CXX_FEATURE:1cxx_variadic_templates
-
-
-Detecting CXX [-std=c++98] compiler features compiled with the following output:
-Change Dir: /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp
-
-Run Build Command:"/usr/bin/make" "cmTC_1ceb0/fast"
-/usr/bin/make -f CMakeFiles/cmTC_1ceb0.dir/build.make CMakeFiles/cmTC_1ceb0.dir/build
-make[1]: Entering directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-Building CXX object CMakeFiles/cmTC_1ceb0.dir/feature_tests.cxx.o
-/usr/bin/c++     -std=c++98 -o CMakeFiles/cmTC_1ceb0.dir/feature_tests.cxx.o -c /home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx
-Linking CXX executable cmTC_1ceb0
-/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_1ceb0.dir/link.txt --verbose=1
-/usr/bin/c++        CMakeFiles/cmTC_1ceb0.dir/feature_tests.cxx.o  -o cmTC_1ceb0 -rdynamic 
-make[1]: Leaving directory '/home/hsharif3/Gitlab/hpvm-fpga/hpvm/llvm/projects/llvm-cbe/build/CMakeFiles/CMakeTmp'
-
-
-    Feature record: CXX_FEATURE:0cxx_aggregate_default_initializers
-    Feature record: CXX_FEATURE:0cxx_alias_templates
-    Feature record: CXX_FEATURE:0cxx_alignas
-    Feature record: CXX_FEATURE:0cxx_alignof
-    Feature record: CXX_FEATURE:0cxx_attributes
-    Feature record: CXX_FEATURE:0cxx_attribute_deprecated
-    Feature record: CXX_FEATURE:0cxx_auto_type
-    Feature record: CXX_FEATURE:0cxx_binary_literals
-    Feature record: CXX_FEATURE:0cxx_constexpr
-    Feature record: CXX_FEATURE:0cxx_contextual_conversions
-    Feature record: CXX_FEATURE:0cxx_decltype
-    Feature record: CXX_FEATURE:0cxx_decltype_auto
-    Feature record: CXX_FEATURE:0cxx_decltype_incomplete_return_types
-    Feature record: CXX_FEATURE:0cxx_default_function_template_args
-    Feature record: CXX_FEATURE:0cxx_defaulted_functions
-    Feature record: CXX_FEATURE:0cxx_defaulted_move_initializers
-    Feature record: CXX_FEATURE:0cxx_delegating_constructors
-    Feature record: CXX_FEATURE:0cxx_deleted_functions
-    Feature record: CXX_FEATURE:0cxx_digit_separators
-    Feature record: CXX_FEATURE:0cxx_enum_forward_declarations
-    Feature record: CXX_FEATURE:0cxx_explicit_conversions
-    Feature record: CXX_FEATURE:0cxx_extended_friend_declarations
-    Feature record: CXX_FEATURE:0cxx_extern_templates
-    Feature record: CXX_FEATURE:0cxx_final
-    Feature record: CXX_FEATURE:0cxx_func_identifier
-    Feature record: CXX_FEATURE:0cxx_generalized_initializers
-    Feature record: CXX_FEATURE:0cxx_generic_lambdas
-    Feature record: CXX_FEATURE:0cxx_inheriting_constructors
-    Feature record: CXX_FEATURE:0cxx_inline_namespaces
-    Feature record: CXX_FEATURE:0cxx_lambdas
-    Feature record: CXX_FEATURE:0cxx_lambda_init_captures
-    Feature record: CXX_FEATURE:0cxx_local_type_template_args
-    Feature record: CXX_FEATURE:0cxx_long_long_type
-    Feature record: CXX_FEATURE:0cxx_noexcept
-    Feature record: CXX_FEATURE:0cxx_nonstatic_member_init
-    Feature record: CXX_FEATURE:0cxx_nullptr
-    Feature record: CXX_FEATURE:0cxx_override
-    Feature record: CXX_FEATURE:0cxx_range_for
-    Feature record: CXX_FEATURE:0cxx_raw_string_literals
-    Feature record: CXX_FEATURE:0cxx_reference_qualified_functions
-    Feature record: CXX_FEATURE:0cxx_relaxed_constexpr
-    Feature record: CXX_FEATURE:0cxx_return_type_deduction
-    Feature record: CXX_FEATURE:0cxx_right_angle_brackets
-    Feature record: CXX_FEATURE:0cxx_rvalue_references
-    Feature record: CXX_FEATURE:0cxx_sizeof_member
-    Feature record: CXX_FEATURE:0cxx_static_assert
-    Feature record: CXX_FEATURE:0cxx_strong_enums
-    Feature record: CXX_FEATURE:1cxx_template_template_parameters
-    Feature record: CXX_FEATURE:0cxx_thread_local
-    Feature record: CXX_FEATURE:0cxx_trailing_return_types
-    Feature record: CXX_FEATURE:0cxx_unicode_literals
-    Feature record: CXX_FEATURE:0cxx_uniform_initialization
-    Feature record: CXX_FEATURE:0cxx_unrestricted_unions
-    Feature record: CXX_FEATURE:0cxx_user_literals
-    Feature record: CXX_FEATURE:0cxx_variable_templates
-    Feature record: CXX_FEATURE:0cxx_variadic_macros
-    Feature record: CXX_FEATURE:0cxx_variadic_templates
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/cmake.check_cache b/hpvm/projects/llvm-cbe/build/CMakeFiles/cmake.check_cache
deleted file mode 100644
index 3dccd731726d7faa8b29d8d7dba3b981a53ca497..0000000000000000000000000000000000000000
--- a/hpvm/projects/llvm-cbe/build/CMakeFiles/cmake.check_cache
+++ /dev/null
@@ -1 +0,0 @@
-# This file is generated by cmake for dependency checking of the CMakeCache.txt file
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.bin b/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.bin
deleted file mode 100755
index 1b62f454d8a2b71fdf6dd528f88f1c018560a607..0000000000000000000000000000000000000000
Binary files a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.bin and /dev/null differ
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c b/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c
deleted file mode 100644
index 6590dded2342f3eebd9b81505327e84a488580e6..0000000000000000000000000000000000000000
--- a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.c
+++ /dev/null
@@ -1,34 +0,0 @@
-
-  const char features[] = {"\n"
-"C_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404
-"1"
-#else
-"0"
-#endif
-"c_function_prototypes\n"
-"C_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
-"1"
-#else
-"0"
-#endif
-"c_restrict\n"
-"C_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201000L
-"1"
-#else
-"0"
-#endif
-"c_static_assert\n"
-"C_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
-"1"
-#else
-"0"
-#endif
-"c_variadic_macros\n"
-
-};
-
-int main(int argc, char** argv) { (void)argv; return features[argc]; }
diff --git a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx b/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx
deleted file mode 100644
index b93418c6ed69feaf1b5c2feb9592bbdb5a5f042c..0000000000000000000000000000000000000000
--- a/hpvm/projects/llvm-cbe/build/CMakeFiles/feature_tests.cxx
+++ /dev/null
@@ -1,405 +0,0 @@
-
-  const char features[] = {"\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 500 && __cplusplus >= 201402L
-"1"
-#else
-"0"
-#endif
-"cxx_aggregate_default_initializers\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_alias_templates\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_alignas\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_alignof\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_attributes\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_attribute_deprecated\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_auto_type\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_binary_literals\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_constexpr\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_contextual_conversions\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_decltype\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_decltype_auto\n"
-"CXX_FEATURE:"
-#if ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 40801) && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_decltype_incomplete_return_types\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_default_function_template_args\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_defaulted_functions\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_defaulted_move_initializers\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_delegating_constructors\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_deleted_functions\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_digit_separators\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_enum_forward_declarations\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_explicit_conversions\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_extended_friend_declarations\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_extern_templates\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_final\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_func_identifier\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_generalized_initializers\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_generic_lambdas\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_inheriting_constructors\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_inline_namespaces\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_lambdas\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_lambda_init_captures\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_local_type_template_args\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_long_long_type\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_noexcept\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_nonstatic_member_init\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_nullptr\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_override\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_range_for\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_raw_string_literals\n"
-"CXX_FEATURE:"
-#if ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 40801) && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_reference_qualified_functions\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 500 && __cplusplus >= 201402L
-"1"
-#else
-"0"
-#endif
-"cxx_relaxed_constexpr\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_return_type_deduction\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_right_angle_brackets\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_rvalue_references\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_sizeof_member\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_static_assert\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_strong_enums\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && __cplusplus
-"1"
-#else
-"0"
-#endif
-"cxx_template_template_parameters\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_thread_local\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_trailing_return_types\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_unicode_literals\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_uniform_initialization\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_unrestricted_unions\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
-"1"
-#else
-"0"
-#endif
-"cxx_user_literals\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 500 && __cplusplus >= 201402L
-"1"
-#else
-"0"
-#endif
-"cxx_variable_templates\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_variadic_macros\n"
-"CXX_FEATURE:"
-#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
-"1"
-#else
-"0"
-#endif
-"cxx_variadic_templates\n"
-
-};
-
-int main(int argc, char** argv) { (void)argv; return features[argc]; }
diff --git a/hpvm/projects/llvm-cbe/include/sample.h b/hpvm/projects/llvm-cbe/include/sample.h
index b3ce9ce2928297fb61db666d865de81542743da3..1d2545fb05c7bc392cd19575aed4f48062366631 100644
--- a/hpvm/projects/llvm-cbe/include/sample.h
+++ b/hpvm/projects/llvm-cbe/include/sample.h
@@ -4,5 +4,4 @@
  *      This is a sample header file that is global to the entire project.
  *      It is located here so that everyone will find it.
  */
-extern int compute_sample (int a);
-
+extern int compute_sample(int a);
diff --git a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp
index cb4d43311ab6754adf270769e56b3dd210a90163..a5fddb967dbd96befef3af7d01fe6c42fd16462c 100644
--- a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp
+++ b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.cpp
@@ -14,14 +14,14 @@
 
 #include "CBackend.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Config/config.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/Host.h"
-#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/Config/config.h"
 #include "llvm/Transforms/Utils.h"
 
 #include <algorithm>
@@ -29,14 +29,13 @@
 
 #include <iostream>
 
-
 //#include "PHINodePass.h"
 
-//Jackson Korba 9/29/14
+// Jackson Korba 9/29/14
 #ifndef DEBUG_TYPE
 #define DEBUG_TYPE ""
 #endif
-//End Modification
+// End Modification
 
 // Some ms header decided to define setjmp as _setjmp, undo this for this file
 // since we don't need it
@@ -52,7 +51,8 @@ extern "C" void LLVMInitializeCBackendTarget() {
 
 char CWriter::ID = 0;
 
-// extra (invalid) Ops tags for tracking unary ops as a special case of the available binary ops
+// extra (invalid) Ops tags for tracking unary ops as a special case of the
+// available binary ops
 enum UnaryOps {
   BinaryNeg = Instruction::OtherOpsEnd + 1,
   BinaryNot,
@@ -61,19 +61,16 @@ enum UnaryOps {
 static bool isEmptyType(Type *Ty) {
   if (StructType *STy = dyn_cast<StructType>(Ty))
     return STy->getNumElements() == 0 ||
-      std::all_of(STy->element_begin(), STy->element_end(), [](Type *T){ return isEmptyType(T); });
+           std::all_of(STy->element_begin(), STy->element_end(),
+                       [](Type *T) { return isEmptyType(T); });
   if (VectorType *VTy = dyn_cast<VectorType>(Ty))
-    return VTy->getNumElements() == 0 ||
-      isEmptyType(VTy->getElementType());
+    return VTy->getNumElements() == 0 || isEmptyType(VTy->getElementType());
   if (ArrayType *ATy = dyn_cast<ArrayType>(Ty))
-    return ATy->getNumElements() == 0 ||
-      isEmptyType(ATy->getElementType());
+    return ATy->getNumElements() == 0 || isEmptyType(ATy->getElementType());
   return Ty->isVoidTy();
 }
 
-bool CWriter::isEmptyType(Type *Ty) const {
-  return ::isEmptyType(Ty);
-}
+bool CWriter::isEmptyType(Type *Ty) const { return ::isEmptyType(Ty); }
 
 /// isAddressExposed - Return true if the specified value's name needs to
 /// have its address taken in order to get a C value of the correct type.
@@ -98,19 +95,19 @@ bool CWriter::isInlinableInst(Instruction &I) const {
   if (isa<GetElementPtrInst>(I)) {
     for (User *U : I.users()) {
       if (!(isa<LoadInst>(U) || isa<StoreInst>(U))) {
-        //DEBUG(errs() << "GEP user not a Load/Store!\n");
+        // DEBUG(errs() << "GEP user not a Load/Store!\n");
         return false;
       }
     }
-    //DEBUG(errs() << "All users of GEP are loads/stores, mark it inlinable!\n");
+    // DEBUG(errs() << "All users of GEP are loads/stores, mark it
+    // inlinable!\n");
     return true;
   }
   // Must be an expression, must be used exactly once.  If it is dead, we
   // emit it inline where it would go.
-  if (isEmptyType(I.getType()) || !I.hasOneUse() ||
-      I.isTerminator() || isa<CallInst>(I) || isa<PHINode>(I) ||
-      isa<LoadInst>(I) || isa<VAArgInst>(I) || isa<InsertElementInst>(I) ||
-      isa<InsertValueInst>(I))
+  if (isEmptyType(I.getType()) || !I.hasOneUse() || I.isTerminator() ||
+      isa<CallInst>(I) || isa<PHINode>(I) || isa<LoadInst>(I) ||
+      isa<VAArgInst>(I) || isa<InsertElementInst>(I) || isa<InsertValueInst>(I))
     // Don't inline a load across a store or other bad things!
     return false;
 
@@ -130,19 +127,20 @@ bool CWriter::isInlinableInst(Instruction &I) const {
 // generate significantly better code than to emit alloca calls directly.
 //
 AllocaInst *CWriter::isDirectAlloca(Value *V) const {
-  //DEBUG(errs() << "Checking if " << *V << " is a direct alloca!\n");
+  // DEBUG(errs() << "Checking if " << *V << " is a direct alloca!\n");
   AllocaInst *AI = dyn_cast<AllocaInst>(V);
-  if (!AI) return 0;
+  if (!AI)
+    return 0;
   // Modification to inline fixed size array alloca!
   if (AI->isArrayAllocation())
-    return AI;   // FIXME: we can also inline fixed size array allocas!
+    return AI; // FIXME: we can also inline fixed size array allocas!
   if (AI->getParent() != &AI->getParent()->getParent()->getEntryBlock())
     return 0;
   return AI;
 }
 
 // isInlineAsm - Check if the instruction is a call to an inline asm chunk.
-bool CWriter::isInlineAsm(Instruction& I) const {
+bool CWriter::isInlineAsm(Instruction &I) const {
   if (CallInst *CI = dyn_cast<CallInst>(&I))
     return isa<InlineAsm>(CI->getCalledValue());
   return false;
@@ -160,19 +158,20 @@ bool CWriter::runOnFunction(Function &F) {
   PDT = &getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
   // Adding Scalar Evolution Pass for loop induction variable
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-  //Adding Dominator Tree Pass
+  // Adding Dominator Tree Pass
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   // Adding Assumption Cache
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   // Adding IVUsers Pass for loop recongnition
   //  IU = &getAnalysis<IVUsersWrapperPass>().getIU();
 
-  BasicBlock* entry = &(F.getEntryBlock());
-  for (df_iterator<BasicBlock*> BI = df_begin(entry), BE = df_end(entry); BI!=BE; ++BI) { 
+  BasicBlock *entry = &(F.getEntryBlock());
+  for (df_iterator<BasicBlock *> BI = df_begin(entry), BE = df_end(entry);
+       BI != BE; ++BI) {
     BasicBlock *BB = *BI;
     if (Loop *L = LI->getLoopFor(&*BB)) {
-      if(simplifyLoop(L, DT, LI, SE, AC, nullptr, /*true*/false)) {
-        //DEBUG(errs() << "Simplified loop!\n" << *L << "\n");
+      if (simplifyLoop(L, DT, LI, SE, AC, nullptr, /*true*/ false)) {
+        // DEBUG(errs() << "Simplified loop!\n" << *L << "\n");
       }
     }
   }
@@ -197,15 +196,15 @@ static std::string CBEMangle(const std::string &S) {
       Result += S[i];
     } else {
       Result += '_';
-      Result += 'A'+(S[i]&15);
-      Result += 'A'+((S[i]>>4)&15);
+      Result += 'A' + (S[i] & 15);
+      Result += 'A' + ((S[i] >> 4) & 15);
       Result += '_';
     }
   return Result;
 }
 
-raw_ostream &
-CWriter::printTypeString(raw_ostream &Out, Type *Ty, bool isSigned) {
+raw_ostream &CWriter::printTypeString(raw_ostream &Out, Type *Ty,
+                                      bool isSigned) {
   if (StructType *ST = dyn_cast<StructType>(Ty)) {
     assert(!isEmptyType(ST));
     TypedefDeclTypes.insert(Ty);
@@ -225,46 +224,51 @@ CWriter::printTypeString(raw_ostream &Out, Type *Ty, bool isSigned) {
   }
 
   switch (Ty->getTypeID()) {
-    case Type::VoidTyID:   return Out << "void";
-    case Type::IntegerTyID: {
-                              unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
-                              if (NumBits == 1)
-                                return Out << "bool";
-                              else {
-                                assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
-                                return Out << (isSigned?"i":"u") << NumBits;
-                              }
-                            }
-    case Type::FloatTyID:    return Out << "f32";
-    case Type::DoubleTyID:   return Out << "f64";
-    case Type::X86_FP80TyID: return Out << "f80";
-    case Type::PPC_FP128TyID:
-    case Type::FP128TyID:    return Out << "f128";
-
-    case Type::X86_MMXTyID:
-                             return Out << (isSigned ? "i32y2" : "u32y2");
-
-    case Type::VectorTyID: {
-                             TypedefDeclTypes.insert(Ty);
-                             VectorType *VTy = cast<VectorType>(Ty);
-                             assert(VTy->getNumElements() != 0);
-                             printTypeString(Out, VTy->getElementType(), isSigned);
-                             return Out << "x" << VTy->getNumElements();
-                           }
-
-    case Type::ArrayTyID: {
-                            TypedefDeclTypes.insert(Ty);
-                            ArrayType *ATy = cast<ArrayType>(Ty);
-                            assert(ATy->getNumElements() != 0);
-                            printTypeString(Out, ATy->getElementType(), isSigned);
-                            return Out << "a" << ATy->getNumElements();
-                          }
+  case Type::VoidTyID:
+    return Out << "void";
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits == 1)
+      return Out << "bool";
+    else {
+      assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
+      return Out << (isSigned ? "i" : "u") << NumBits;
+    }
+  }
+  case Type::FloatTyID:
+    return Out << "f32";
+  case Type::DoubleTyID:
+    return Out << "f64";
+  case Type::X86_FP80TyID:
+    return Out << "f80";
+  case Type::PPC_FP128TyID:
+  case Type::FP128TyID:
+    return Out << "f128";
 
-    default:
+  case Type::X86_MMXTyID:
+    return Out << (isSigned ? "i32y2" : "u32y2");
+
+  case Type::VectorTyID: {
+    TypedefDeclTypes.insert(Ty);
+    VectorType *VTy = cast<VectorType>(Ty);
+    assert(VTy->getNumElements() != 0);
+    printTypeString(Out, VTy->getElementType(), isSigned);
+    return Out << "x" << VTy->getNumElements();
+  }
+
+  case Type::ArrayTyID: {
+    TypedefDeclTypes.insert(Ty);
+    ArrayType *ATy = cast<ArrayType>(Ty);
+    assert(ATy->getNumElements() != 0);
+    printTypeString(Out, ATy->getElementType(), isSigned);
+    return Out << "a" << ATy->getNumElements();
+  }
+
+  default:
 #ifndef NDEBUG
-                          errs() << "Unknown primitive type: " << *Ty << "\n";
+    errs() << "Unknown primitive type: " << *Ty << "\n";
 #endif
-                          llvm_unreachable(0);
+    llvm_unreachable(0);
   }
 }
 
@@ -279,8 +283,9 @@ std::string CWriter::getStructName(StructType *ST) {
   return "struct l_unnamed_" + utostr(id);
 }
 
-std::string CWriter::getFunctionName(FunctionType *FT,
-    std::pair<AttributeList, CallingConv::ID> PAL) {
+std::string
+CWriter::getFunctionName(FunctionType *FT,
+                         std::pair<AttributeList, CallingConv::ID> PAL) {
   unsigned &id = UnnamedFunctionIDs[std::make_pair(FT, PAL)];
   if (id == 0)
     id = ++NextFunctionNumber;
@@ -294,7 +299,8 @@ std::string CWriter::getArrayName(ArrayType *AT) {
   // value semantics (avoiding the array "decay").
   assert(!isEmptyType(AT));
   printTypeName(ArrayInnards, AT->getElementType(), false);
-  return "struct l_array_" + utostr(AT->getNumElements()) + '_' + CBEMangle(ArrayInnards.str());
+  return "struct l_array_" + utostr(AT->getNumElements()) + '_' +
+         CBEMangle(ArrayInnards.str());
 }
 
 std::string CWriter::getVectorName(VectorType *VT, bool Aligned) {
@@ -305,95 +311,125 @@ std::string CWriter::getVectorName(VectorType *VT, bool Aligned) {
   //    if (Aligned)
   //      Out << "__MSALIGN__(" << TD->getABITypeAlignment(VT) << ") ";
   printTypeName(VectorInnards, VT->getElementType(), false);
-  return "struct l_vector_" + utostr(VT->getNumElements()) + '_' + CBEMangle(VectorInnards.str());
+  return "struct l_vector_" + utostr(VT->getNumElements()) + '_' +
+         CBEMangle(VectorInnards.str());
 }
 
-
 static const std::string getCmpPredicateName(CmpInst::Predicate P) {
   switch (P) {
-    case FCmpInst::FCMP_FALSE: return "0";
-    case FCmpInst::FCMP_OEQ: return "oeq";
-    case FCmpInst::FCMP_OGT: return "ogt";
-    case FCmpInst::FCMP_OGE: return "oge";
-    case FCmpInst::FCMP_OLT: return "olt";
-    case FCmpInst::FCMP_OLE: return "ole";
-    case FCmpInst::FCMP_ONE: return "one";
-    case FCmpInst::FCMP_ORD: return "ord";
-    case FCmpInst::FCMP_UNO: return "uno";
-    case FCmpInst::FCMP_UEQ: return "ueq";
-    case FCmpInst::FCMP_UGT: return "ugt";
-    case FCmpInst::FCMP_UGE: return "uge";
-    case FCmpInst::FCMP_ULT: return "ult";
-    case FCmpInst::FCMP_ULE: return "ule";
-    case FCmpInst::FCMP_UNE: return "une";
-    case FCmpInst::FCMP_TRUE: return "1";
-    case ICmpInst::ICMP_EQ:  return "eq";
-    case ICmpInst::ICMP_NE:  return "ne";
-    case ICmpInst::ICMP_ULE: return "ule";
-    case ICmpInst::ICMP_SLE: return "sle";
-    case ICmpInst::ICMP_UGE: return "uge";
-    case ICmpInst::ICMP_SGE: return "sge";
-    case ICmpInst::ICMP_ULT: return "ult";
-    case ICmpInst::ICMP_SLT: return "slt";
-    case ICmpInst::ICMP_UGT: return "ugt";
-    case ICmpInst::ICMP_SGT: return "sgt";
-    default:
+  case FCmpInst::FCMP_FALSE:
+    return "0";
+  case FCmpInst::FCMP_OEQ:
+    return "oeq";
+  case FCmpInst::FCMP_OGT:
+    return "ogt";
+  case FCmpInst::FCMP_OGE:
+    return "oge";
+  case FCmpInst::FCMP_OLT:
+    return "olt";
+  case FCmpInst::FCMP_OLE:
+    return "ole";
+  case FCmpInst::FCMP_ONE:
+    return "one";
+  case FCmpInst::FCMP_ORD:
+    return "ord";
+  case FCmpInst::FCMP_UNO:
+    return "uno";
+  case FCmpInst::FCMP_UEQ:
+    return "ueq";
+  case FCmpInst::FCMP_UGT:
+    return "ugt";
+  case FCmpInst::FCMP_UGE:
+    return "uge";
+  case FCmpInst::FCMP_ULT:
+    return "ult";
+  case FCmpInst::FCMP_ULE:
+    return "ule";
+  case FCmpInst::FCMP_UNE:
+    return "une";
+  case FCmpInst::FCMP_TRUE:
+    return "1";
+  case ICmpInst::ICMP_EQ:
+    return "eq";
+  case ICmpInst::ICMP_NE:
+    return "ne";
+  case ICmpInst::ICMP_ULE:
+    return "ule";
+  case ICmpInst::ICMP_SLE:
+    return "sle";
+  case ICmpInst::ICMP_UGE:
+    return "uge";
+  case ICmpInst::ICMP_SGE:
+    return "sge";
+  case ICmpInst::ICMP_ULT:
+    return "ult";
+  case ICmpInst::ICMP_SLT:
+    return "slt";
+  case ICmpInst::ICMP_UGT:
+    return "ugt";
+  case ICmpInst::ICMP_SGT:
+    return "sgt";
+  default:
 #ifndef NDEBUG
-                             errs() << "Invalid icmp predicate!" << P;
+    errs() << "Invalid icmp predicate!" << P;
 #endif
-                             llvm_unreachable(0);
+    llvm_unreachable(0);
   }
 }
 
-
-raw_ostream &
-CWriter::printSimpleType(raw_ostream &Out, Type *Ty, bool isSigned) {
+raw_ostream &CWriter::printSimpleType(raw_ostream &Out, Type *Ty,
+                                      bool isSigned) {
   assert((Ty->isSingleValueType() || Ty->isVoidTy()) &&
-      "Invalid type for printSimpleType");
+         "Invalid type for printSimpleType");
   switch (Ty->getTypeID()) {
-    case Type::VoidTyID:   return Out << "void";
-    case Type::IntegerTyID: {
-                              unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
-                              if (NumBits == 1)
-                                return Out << "bool";
-                              else if (NumBits <= 8)
-                                return Out << (isSigned?"char":"uchar");
-                              else if (NumBits <= 16)
-                                return Out << (isSigned?"short":"ushort");
-                              else if (NumBits <= 32)
-                                return Out << (isSigned?"int":"uint"); // !!FIX ME
-                              else if (NumBits <= 64)
-                                return Out << (isSigned?"long":"ulong");
-                              else {
-                                assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
-                                return Out << (isSigned?"int128_t":"uint128_t");
-                              }
-                            }
-    case Type::FloatTyID:  return Out << "float";
-    case Type::DoubleTyID: return Out << "double";
-                           // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is
-                           // present matches host 'long double'.
-    case Type::X86_FP80TyID:
-    case Type::PPC_FP128TyID:
-    case Type::FP128TyID:  return Out << "long double";
-
-    case Type::X86_MMXTyID:
-                           return Out << (isSigned?"int":"uint") << " __attribute__((vector_size(8)))";
-
-    default:
+  case Type::VoidTyID:
+    return Out << "void";
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits == 1)
+      return Out << "bool";
+    else if (NumBits <= 8)
+      return Out << (isSigned ? "char" : "uchar");
+    else if (NumBits <= 16)
+      return Out << (isSigned ? "short" : "ushort");
+    else if (NumBits <= 32)
+      return Out << (isSigned ? "int" : "uint"); // !!FIX ME
+    else if (NumBits <= 64)
+      return Out << (isSigned ? "long" : "ulong");
+    else {
+      assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
+      return Out << (isSigned ? "int128_t" : "uint128_t");
+    }
+  }
+  case Type::FloatTyID:
+    return Out << "float";
+  case Type::DoubleTyID:
+    return Out << "double";
+    // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is
+    // present matches host 'long double'.
+  case Type::X86_FP80TyID:
+  case Type::PPC_FP128TyID:
+  case Type::FP128TyID:
+    return Out << "long double";
+
+  case Type::X86_MMXTyID:
+    return Out << (isSigned ? "int" : "uint")
+               << " __attribute__((vector_size(8)))";
+
+  default:
 #ifndef NDEBUG
-                           errs() << "Unknown primitive type: " << *Ty << "\n";
+    errs() << "Unknown primitive type: " << *Ty << "\n";
 #endif
-                           llvm_unreachable(0);
+    llvm_unreachable(0);
   }
 }
 
 // Pass the Type* and the variable name and this prints out the variable
 // declaration.
 //
-raw_ostream &CWriter::printTypeName(raw_ostream &Out, Type *Ty,
-    bool isSigned,
-    std::pair<AttributeList, CallingConv::ID> PAL) {
+raw_ostream &
+CWriter::printTypeName(raw_ostream &Out, Type *Ty, bool isSigned,
+                       std::pair<AttributeList, CallingConv::ID> PAL) {
 
   if (Ty->isSingleValueType() || Ty->isVoidTy()) {
     if (!Ty->isPointerTy() && !Ty->isVectorTy())
@@ -404,39 +440,40 @@ raw_ostream &CWriter::printTypeName(raw_ostream &Out, Type *Ty,
     return Out << "void";
 
   switch (Ty->getTypeID()) {
-    case Type::FunctionTyID: {
-                               FunctionType *FTy = cast<FunctionType>(Ty);
-                               return Out << getFunctionName(FTy, PAL);
-                             }
-    case Type::StructTyID: {
-                             TypedefDeclTypes.insert(Ty);
-                             return Out << getStructName(cast<StructType>(Ty));
-                           }
-
-    case Type::PointerTyID: {
-                              Type *ElTy = Ty->getPointerElementType();
-                              return printTypeName(Out, ElTy, false) << '*';
-                            }
-
-    case Type::ArrayTyID: {
-                            TypedefDeclTypes.insert(Ty);
-                            return Out << getArrayName(cast<ArrayType>(Ty));
-                          }
-
-    case Type::VectorTyID: {
-                             TypedefDeclTypes.insert(Ty);
-                             return Out << getVectorName(cast<VectorType>(Ty), true);
-                           }
+  case Type::FunctionTyID: {
+    FunctionType *FTy = cast<FunctionType>(Ty);
+    return Out << getFunctionName(FTy, PAL);
+  }
+  case Type::StructTyID: {
+    TypedefDeclTypes.insert(Ty);
+    return Out << getStructName(cast<StructType>(Ty));
+  }
 
-    default:
+  case Type::PointerTyID: {
+    Type *ElTy = Ty->getPointerElementType();
+    return printTypeName(Out, ElTy, false) << '*';
+  }
+
+  case Type::ArrayTyID: {
+    TypedefDeclTypes.insert(Ty);
+    return Out << getArrayName(cast<ArrayType>(Ty));
+  }
+
+  case Type::VectorTyID: {
+    TypedefDeclTypes.insert(Ty);
+    return Out << getVectorName(cast<VectorType>(Ty), true);
+  }
+
+  default:
 #ifndef NDEBUG
-                           errs() << "Unexpected type: " << *Ty << "\n";
+    errs() << "Unexpected type: " << *Ty << "\n";
 #endif
-                           llvm_unreachable(0);
+    llvm_unreachable(0);
   }
 }
 
-raw_ostream &CWriter::printTypeNameUnaligned(raw_ostream &Out, Type *Ty, bool isSigned) {
+raw_ostream &CWriter::printTypeNameUnaligned(raw_ostream &Out, Type *Ty,
+                                             bool isSigned) {
   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
     // MSVC doesn't handle __declspec(align) on parameters,
     // but we specify it for Vector (hoping the compiler will vectorize it)
@@ -447,13 +484,15 @@ raw_ostream &CWriter::printTypeNameUnaligned(raw_ostream &Out, Type *Ty, bool is
   return printTypeName(Out, Ty, isSigned);
 }
 
-raw_ostream &CWriter::printStructDeclaration(raw_ostream &Out, StructType *STy) {
+raw_ostream &CWriter::printStructDeclaration(raw_ostream &Out,
+                                             StructType *STy) {
   if (STy->isPacked())
     Out << "#ifdef _MSC_VER\n#pragma pack(push, 1)\n#endif\n";
   Out << getStructName(STy) << " {\n";
   unsigned Idx = 0;
   for (StructType::element_iterator I = STy->element_begin(),
-      E = STy->element_end(); I != E; ++I, Idx++) {
+                                    E = STy->element_end();
+       I != E; ++I, Idx++) {
     Out << "  ";
     bool empty = isEmptyType(*I);
     if (empty)
@@ -473,21 +512,23 @@ raw_ostream &CWriter::printStructDeclaration(raw_ostream &Out, StructType *STy)
   return Out;
 }
 
-raw_ostream &CWriter::printFunctionDeclaration(raw_ostream &Out, FunctionType *Ty,
-    std::pair<AttributeList, CallingConv::ID> PAL){
+raw_ostream &CWriter::printFunctionDeclaration(
+    raw_ostream &Out, FunctionType *Ty,
+    std::pair<AttributeList, CallingConv::ID> PAL) {
 
   Out << "typedef ";
   printFunctionProto(Out, Ty, PAL, getFunctionName(Ty, PAL), NULL, false);
   return Out << ";\n";
 }
 
-raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
-    std::pair<AttributeList, CallingConv::ID> Attrs,
-    const std::string &Name,
-    Function::arg_iterator ArgList,
-    bool isKernel) {
+raw_ostream &
+CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
+                            std::pair<AttributeList, CallingConv::ID> Attrs,
+                            const std::string &Name,
+                            Function::arg_iterator ArgList, bool isKernel) {
 
-  // NOTE: AttributeSet is replaced by 'AttributeList' at function level in LLVM-9
+  // NOTE: AttributeSet is replaced by 'AttributeList' at function level in
+  // LLVM-9
   AttributeList &PAL = Attrs.first;
 
   if (PAL.hasAttribute(AttributeList::FunctionIndex, Attribute::NoReturn))
@@ -498,7 +539,7 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
 
   // Should this function actually return a struct by-value?
   bool isStructReturn = PAL.hasAttribute(1, Attribute::StructRet) ||
-    PAL.hasAttribute(2, Attribute::StructRet);
+                        PAL.hasAttribute(2, Attribute::StructRet);
   // Get the return type for the function.
   Type *RetTy;
   if (!isStructReturn)
@@ -508,24 +549,25 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
     RetTy = cast<PointerType>(FTy->getParamType(0))->getElementType();
   }
   printTypeName(Out, RetTy,
-      /*isSigned=*/PAL.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt));
+                /*isSigned=*/
+                PAL.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt));
 
   Out << "/* Processing Function: " << Name << ": " << Attrs.second << "*/\n";
   switch (Attrs.second) {
-    case CallingConv::C:
-      break;
-    case CallingConv::X86_StdCall:
-      Out << " __stdcall";
-      break;
-    case CallingConv::X86_FastCall:
-      Out << " __fastcall";
-      break;
-    case CallingConv::X86_ThisCall:
-      Out << " __thiscall";
-      break;
-    default:
-      //    assert(0 && "Encountered Unhandled Calling Convention");
-      break;
+  case CallingConv::C:
+    break;
+  case CallingConv::X86_StdCall:
+    Out << " __stdcall";
+    break;
+  case CallingConv::X86_FastCall:
+    Out << " __fastcall";
+    break;
+  case CallingConv::X86_ThisCall:
+    Out << " __thiscall";
+    break;
+  default:
+    //    assert(0 && "Encountered Unhandled Calling Convention");
+    break;
   }
   Out << ' ' << Name << '(';
 
@@ -533,7 +575,8 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
   bool PrintedArg = false;
   FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end();
 
-  //Function::arg_iterator ArgName = ArgList ? ArgList->begin() : Function::arg_iterator();
+  // Function::arg_iterator ArgName = ArgList ? ArgList->begin() :
+  // Function::arg_iterator();
   // NOTE: ArgumentLists not supported in LLVM-9
   Function::arg_iterator ArgName = ArgList ? ArgList : Function::arg_iterator();
 
@@ -544,8 +587,10 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
     assert(I != E && "Invalid struct return function!");
     ++I;
     ++Idx;
-    // CHECK: very confused as to how next loop starts from first Function Param?
-    if (ArgList) ++ArgName;
+    // CHECK: very confused as to how next loop starts from first Function
+    // Param?
+    if (ArgList)
+      ++ArgName;
   }
 
   for (; I != E; ++I) {
@@ -559,27 +604,27 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
 
     if (PointerType *PTy = dyn_cast<PointerType>(ArgTy)) {
       unsigned AddrSpace = PTy->getAddressSpace();
-      //DEBUG(errs() << "AddrSpace for " << Idx << " = " << AddrSpace << "\n");
-      switch(AddrSpace) {
-        case GLOBAL_ADDRSPACE:
-          Out << "__global ";
-          break;
-        case SHARED_ADDRSPACE:
-          Out << "__local ";
-          break;
-        case CONSTANT_ADDRSPACE:
-          Out << "__constant ";
-          break;
-        case PRIVATE_ADDRSPACE:
-          Out << "__private ";
-          break;
-        default:
-          break;
+      // DEBUG(errs() << "AddrSpace for " << Idx << " = " << AddrSpace << "\n");
+      switch (AddrSpace) {
+      case GLOBAL_ADDRSPACE:
+        Out << "__global ";
+        break;
+      case SHARED_ADDRSPACE:
+        Out << "__local ";
+        break;
+      case CONSTANT_ADDRSPACE:
+        Out << "__constant ";
+        break;
+      case PRIVATE_ADDRSPACE:
+        Out << "__private ";
+        break;
+      default:
+        break;
       }
     }
 
     printTypeNameUnaligned(Out, ArgTy,
-        /*isSigned=*/PAL.hasAttribute(Idx, Attribute::SExt));
+                           /*isSigned=*/PAL.hasAttribute(Idx, Attribute::SExt));
     PrintedArg = true;
     bool noalias = false;
     if (PAL.hasAttribute(Idx, Attribute::NoAlias)) {
@@ -588,15 +633,16 @@ raw_ostream &CWriter::printFunctionProto(raw_ostream &Out, FunctionType *FTy,
     ++Idx;
     if (ArgList) {
 
-      Out << ' ' << (noalias ? " restrict " : "")  << GetValueName(&*ArgName);
+      Out << ' ' << (noalias ? " restrict " : "") << GetValueName(&*ArgName);
       ++ArgName;
     }
   }
 
   if (FTy->isVarArg()) {
     if (!PrintedArg) {
-      Out << "int"; //dummy argument for empty vaarg functs
-      if (ArgList) Out << " vararg_dummy_arg";
+      Out << "int"; // dummy argument for empty vaarg functs
+      if (ArgList)
+        Out << " vararg_dummy_arg";
     }
     Out << ", ...";
   } else if (!PrintedArg) {
@@ -616,16 +662,20 @@ raw_ostream &CWriter::printArrayDeclaration(raw_ostream &Out, ArrayType *ATy) {
   return Out;
 }
 
-raw_ostream &CWriter::printVectorDeclaration(raw_ostream &Out, VectorType *VTy) {
+raw_ostream &CWriter::printVectorDeclaration(raw_ostream &Out,
+                                             VectorType *VTy) {
   assert(!isEmptyType(VTy));
   // Vectors are printed like arrays
   Out << getVectorName(VTy, false) << " {\n  ";
   printTypeName(Out, VTy->getElementType());
-  Out << " vector[" << utostr(VTy->getNumElements()) << "];\n} __attribute__((aligned(" << TD->getABITypeAlignment(VTy) << ")));\n";
+  Out << " vector[" << utostr(VTy->getNumElements())
+      << "];\n} __attribute__((aligned(" << TD->getABITypeAlignment(VTy)
+      << ")));\n";
   return Out;
 }
 
-void CWriter::printConstantArray(ConstantArray *CPA, enum OperandContext Context) {
+void CWriter::printConstantArray(ConstantArray *CPA,
+                                 enum OperandContext Context) {
   printConstant(cast<Constant>(CPA->getOperand(0)), Context);
   for (unsigned i = 1, e = CPA->getNumOperands(); i != e; ++i) {
     Out << ", ";
@@ -633,7 +683,8 @@ void CWriter::printConstantArray(ConstantArray *CPA, enum OperandContext Context
   }
 }
 
-void CWriter::printConstantVector(ConstantVector *CP, enum OperandContext Context) {
+void CWriter::printConstantVector(ConstantVector *CP,
+                                  enum OperandContext Context) {
   printConstant(cast<Constant>(CP->getOperand(0)), Context);
   for (unsigned i = 1, e = CP->getNumOperands(); i != e; ++i) {
     Out << ", ";
@@ -641,7 +692,8 @@ void CWriter::printConstantVector(ConstantVector *CP, enum OperandContext Contex
   }
 }
 
-void CWriter::printConstantDataSequential(ConstantDataSequential *CDS, enum OperandContext Context) {
+void CWriter::printConstantDataSequential(ConstantDataSequential *CDS,
+                                          enum OperandContext Context) {
   printConstant(CDS->getElementAsConstant(0), Context);
   for (unsigned i = 1, e = CDS->getNumElements(); i != e; ++i) {
     Out << ", ";
@@ -653,8 +705,10 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) {
   // As a special case, print the array as a string if it is an array of
   // ubytes or an array of sbytes with positive values.
   ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(C);
-  if (!CDS || !CDS->isCString()) return false;
-  if (Context != ContextStatic) return false; // TODO
+  if (!CDS || !CDS->isCString())
+    return false;
+  if (Context != ContextStatic)
+    return false; // TODO
 
   Out << "{ \"";
   // Keep track of whether the last number was a hexadecimal escape.
@@ -681,19 +735,34 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) {
     } else {
       LastWasHex = false;
       switch (C) {
-        case '\n': Out << "\\n"; break;
-        case '\t': Out << "\\t"; break;
-        case '\r': Out << "\\r"; break;
-        case '\v': Out << "\\v"; break;
-        case '\a': Out << "\\a"; break;
-        case '\"': Out << "\\\""; break;
-        case '\'': Out << "\\\'"; break;
-        default:
-                   Out << "\\x";
-                   Out << (char)(( C/16  < 10) ? ( C/16 +'0') : ( C/16 -10+'A'));
-                   Out << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A'));
-                   LastWasHex = true;
-                   break;
+      case '\n':
+        Out << "\\n";
+        break;
+      case '\t':
+        Out << "\\t";
+        break;
+      case '\r':
+        Out << "\\r";
+        break;
+      case '\v':
+        Out << "\\v";
+        break;
+      case '\a':
+        Out << "\\a";
+        break;
+      case '\"':
+        Out << "\\\"";
+        break;
+      case '\'':
+        Out << "\\\'";
+        break;
+      default:
+        Out << "\\x";
+        Out << (char)((C / 16 < 10) ? (C / 16 + '0') : (C / 16 - 10 + 'A'));
+        Out << (char)(((C & 15) < 10) ? ((C & 15) + '0')
+                                      : ((C & 15) - 10 + 'A'));
+        LastWasHex = true;
+        break;
       }
     }
   }
@@ -701,7 +770,6 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) {
   return true;
 }
 
-
 // isFPCSafeToPrint - Returns true if we may assume that CFP may be written out
 // textually as a double (rather than as a reference to a stack-allocated
 // variable). We decide this by converting CFP to a string and back into a
@@ -712,7 +780,7 @@ bool CWriter::printConstantString(Constant *C, enum OperandContext Context) {
 //
 
 // TODO copied from CppBackend, new code should use raw_ostream
-static inline std::string ftostr(const APFloat& V) {
+static inline std::string ftostr(const APFloat &V) {
   std::string Buf;
   if (&V.getSemantics() == &APFloat::IEEEdouble()) {
     raw_string_ostream(Buf) << V.convertToDouble();
@@ -730,14 +798,13 @@ static bool isFPCSafeToPrint(const ConstantFP *CFP) {
   if (CFP->getType() != Type::getFloatTy(CFP->getContext()) &&
       CFP->getType() != Type::getDoubleTy(CFP->getContext()))
     return false;
-  APFloat APF = APFloat(CFP->getValueAPF());  // copy
+  APFloat APF = APFloat(CFP->getValueAPF()); // copy
   if (CFP->getType() == Type::getFloatTy(CFP->getContext()))
     APF.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, &ignored);
 #if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
   char Buffer[100];
   sprintf(Buffer, "%a", APF.convertToDouble());
-  if (!strncmp(Buffer, "0x", 2) ||
-      !strncmp(Buffer, "-0x", 3) ||
+  if (!strncmp(Buffer, "0x", 2) || !strncmp(Buffer, "-0x", 3) ||
       !strncmp(Buffer, "+0x", 3))
     return APF.bitwiseIsEqual(APFloat(atof(Buffer)));
   return false;
@@ -764,211 +831,249 @@ static bool isFPCSafeToPrint(const ConstantFP *CFP) {
 void CWriter::printCast(unsigned opc, Type *SrcTy, Type *DstTy) {
   // Print the destination type cast
   switch (opc) {
-    case Instruction::UIToFP:
-    case Instruction::SIToFP:
-    case Instruction::IntToPtr:
-    case Instruction::Trunc:
-    case Instruction::BitCast:
-    case Instruction::FPExt:
-    case Instruction::FPTrunc: // For these the DstTy sign doesn't matter
-      Out << '(';
-      printTypeName(Out, DstTy);
-      Out << ')';
-      break;
-    case Instruction::ZExt:
-    case Instruction::PtrToInt:
-    case Instruction::FPToUI: // For these, make sure we get an unsigned dest
-      Out << '(';
-      printSimpleType(Out, DstTy, false);
-      Out << ')';
-      break;
-    case Instruction::SExt:
-    case Instruction::FPToSI: // For these, make sure we get a signed dest
-      Out << '(';
-      printSimpleType(Out, DstTy, true);
-      Out << ')';
-      break;
-    default:
-      llvm_unreachable("Invalid cast opcode");
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::IntToPtr:
+  case Instruction::Trunc:
+  case Instruction::BitCast:
+  case Instruction::FPExt:
+  case Instruction::FPTrunc: // For these the DstTy sign doesn't matter
+    Out << '(';
+    printTypeName(Out, DstTy);
+    Out << ')';
+    break;
+  case Instruction::ZExt:
+  case Instruction::PtrToInt:
+  case Instruction::FPToUI: // For these, make sure we get an unsigned dest
+    Out << '(';
+    printSimpleType(Out, DstTy, false);
+    Out << ')';
+    break;
+  case Instruction::SExt:
+  case Instruction::FPToSI: // For these, make sure we get a signed dest
+    Out << '(';
+    printSimpleType(Out, DstTy, true);
+    Out << ')';
+    break;
+  default:
+    llvm_unreachable("Invalid cast opcode");
   }
 
   // Print the source type cast
   switch (opc) {
-    case Instruction::UIToFP:
-    case Instruction::ZExt:
-      Out << '(';
-      printSimpleType(Out, SrcTy, false);
-      Out << ')';
-      break;
-    case Instruction::SIToFP:
-    case Instruction::SExt:
-      Out << '(';
-      printSimpleType(Out, SrcTy, true);
-      Out << ')';
-      break;
-    case Instruction::IntToPtr:
-    case Instruction::PtrToInt:
-      // Avoid "cast to pointer from integer of different size" warnings
-      Out << "(uintptr_t)";
-      break;
-    case Instruction::Trunc:
-    case Instruction::BitCast:
-    case Instruction::FPExt:
-    case Instruction::FPTrunc:
-    case Instruction::FPToSI:
-    case Instruction::FPToUI:
-      break; // These don't need a source cast.
-    default:
-      llvm_unreachable("Invalid cast opcode");
+  case Instruction::UIToFP:
+  case Instruction::ZExt:
+    Out << '(';
+    printSimpleType(Out, SrcTy, false);
+    Out << ')';
+    break;
+  case Instruction::SIToFP:
+  case Instruction::SExt:
+    Out << '(';
+    printSimpleType(Out, SrcTy, true);
+    Out << ')';
+    break;
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+    // Avoid "cast to pointer from integer of different size" warnings
+    Out << "(uintptr_t)";
+    break;
+  case Instruction::Trunc:
+  case Instruction::BitCast:
+  case Instruction::FPExt:
+  case Instruction::FPTrunc:
+  case Instruction::FPToSI:
+  case Instruction::FPToUI:
+    break; // These don't need a source cast.
+  default:
+    llvm_unreachable("Invalid cast opcode");
   }
 }
 
 // printConstant - The LLVM Constant to C Constant converter.
 void CWriter::printConstant(Constant *CPV, enum OperandContext Context) {
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) {
-    assert(CE->getType()->isIntegerTy() || CE->getType()->isFloatingPointTy() || CE->getType()->isPointerTy()); // TODO: VectorType are valid here, but not supported
+    assert(CE->getType()->isIntegerTy() || CE->getType()->isFloatingPointTy() ||
+           CE->getType()->isPointerTy()); // TODO: VectorType are valid here,
+                                          // but not supported
     GetElementPtrInst *GEPI;
     switch (CE->getOpcode()) {
-      case Instruction::Trunc:
-      case Instruction::ZExt:
-      case Instruction::SExt:
-      case Instruction::FPTrunc:
-      case Instruction::FPExt:
-      case Instruction::UIToFP:
-      case Instruction::SIToFP:
-      case Instruction::FPToUI:
-      case Instruction::FPToSI:
-      case Instruction::PtrToInt:
-      case Instruction::IntToPtr:
-      case Instruction::BitCast:
-        Out << "(";
-        printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType());
-        if (CE->getOpcode() == Instruction::SExt &&
-            CE->getOperand(0)->getType() == Type::getInt1Ty(CPV->getContext())) {
-          // Make sure we really sext from bool here by subtracting from 0
-          Out << "0-";
-        }
-        printConstant(CE->getOperand(0), ContextCasted);
-        if (CE->getType() == Type::getInt1Ty(CPV->getContext()) &&
-            (CE->getOpcode() == Instruction::Trunc ||
-             CE->getOpcode() == Instruction::FPToUI ||
-             CE->getOpcode() == Instruction::FPToSI ||
-             CE->getOpcode() == Instruction::PtrToInt)) {
-          // Make sure we really truncate to bool here by anding with 1
-          Out << "&1u";
-        }
-        Out << ')';
-        return;
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      Out << "(";
+      printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType());
+      if (CE->getOpcode() == Instruction::SExt &&
+          CE->getOperand(0)->getType() == Type::getInt1Ty(CPV->getContext())) {
+        // Make sure we really sext from bool here by subtracting from 0
+        Out << "0-";
+      }
+      printConstant(CE->getOperand(0), ContextCasted);
+      if (CE->getType() == Type::getInt1Ty(CPV->getContext()) &&
+          (CE->getOpcode() == Instruction::Trunc ||
+           CE->getOpcode() == Instruction::FPToUI ||
+           CE->getOpcode() == Instruction::FPToSI ||
+           CE->getOpcode() == Instruction::PtrToInt)) {
+        // Make sure we really truncate to bool here by anding with 1
+        Out << "&1u";
+      }
+      Out << ')';
+      return;
 
-      case Instruction::GetElementPtr:
-        Out << "(";
-        //DEBUG(errs() << "\n----------\nCE: " << *CE << "\n");
-        GEPI = dyn_cast<GetElementPtrInst>(CE->getAsInstruction());
-        //DEBUG(errs() << "GEPI: " << *GEPI << "\n");
-        printGEPExpression(CE->getOperand(0), gep_type_begin(CPV), gep_type_end(CPV), CE->getOperand(0)->getType()->isArrayTy(), GEPI);
-        delete(GEPI);
-        //DEBUG(errs() << "Deleted GEPI!\n");
-        Out << ")";
-        return;
-      case Instruction::Select:
-        Out << '(';
-        printConstant(CE->getOperand(0), ContextCasted);
-        Out << '?';
-        printConstant(CE->getOperand(1), ContextNormal);
-        Out << ':';
-        printConstant(CE->getOperand(2), ContextNormal);
-        Out << ')';
-        return;
+    case Instruction::GetElementPtr:
+      Out << "(";
+      // DEBUG(errs() << "\n----------\nCE: " << *CE << "\n");
+      GEPI = dyn_cast<GetElementPtrInst>(CE->getAsInstruction());
+      // DEBUG(errs() << "GEPI: " << *GEPI << "\n");
+      printGEPExpression(CE->getOperand(0), gep_type_begin(CPV),
+                         gep_type_end(CPV),
+                         CE->getOperand(0)->getType()->isArrayTy(), GEPI);
+      delete (GEPI);
+      // DEBUG(errs() << "Deleted GEPI!\n");
+      Out << ")";
+      return;
+    case Instruction::Select:
+      Out << '(';
+      printConstant(CE->getOperand(0), ContextCasted);
+      Out << '?';
+      printConstant(CE->getOperand(1), ContextNormal);
+      Out << ':';
+      printConstant(CE->getOperand(2), ContextNormal);
+      Out << ')';
+      return;
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::ICmp:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr: {
+      Out << '(';
+      bool NeedsClosingParens = printConstExprCast(CE);
+      printConstantWithCast(CE->getOperand(0), CE->getOpcode());
+      switch (CE->getOpcode()) {
       case Instruction::Add:
       case Instruction::FAdd:
+        Out << " + ";
+        break;
       case Instruction::Sub:
       case Instruction::FSub:
+        Out << " - ";
+        break;
       case Instruction::Mul:
       case Instruction::FMul:
-      case Instruction::SDiv:
-      case Instruction::UDiv:
-      case Instruction::FDiv:
+        Out << " * ";
+        break;
       case Instruction::URem:
       case Instruction::SRem:
       case Instruction::FRem:
+        Out << " % ";
+        break;
+      case Instruction::UDiv:
+      case Instruction::SDiv:
+      case Instruction::FDiv:
+        Out << " / ";
+        break;
       case Instruction::And:
+        Out << " & ";
+        break;
       case Instruction::Or:
+        Out << " | ";
+        break;
       case Instruction::Xor:
-      case Instruction::ICmp:
+        Out << " ^ ";
+        break;
       case Instruction::Shl:
+        Out << " << ";
+        break;
       case Instruction::LShr:
       case Instruction::AShr:
-        {
-          Out << '(';
-          bool NeedsClosingParens = printConstExprCast(CE);
-          printConstantWithCast(CE->getOperand(0), CE->getOpcode());
-          switch (CE->getOpcode()) {
-            case Instruction::Add:
-            case Instruction::FAdd: Out << " + "; break;
-            case Instruction::Sub:
-            case Instruction::FSub: Out << " - "; break;
-            case Instruction::Mul:
-            case Instruction::FMul: Out << " * "; break;
-            case Instruction::URem:
-            case Instruction::SRem:
-            case Instruction::FRem: Out << " % "; break;
-            case Instruction::UDiv:
-            case Instruction::SDiv:
-            case Instruction::FDiv: Out << " / "; break;
-            case Instruction::And: Out << " & "; break;
-            case Instruction::Or:  Out << " | "; break;
-            case Instruction::Xor: Out << " ^ "; break;
-            case Instruction::Shl: Out << " << "; break;
-            case Instruction::LShr:
-            case Instruction::AShr: Out << " >> "; break;
-            case Instruction::ICmp:
-                                    switch (CE->getPredicate()) {
-                                      case ICmpInst::ICMP_EQ: Out << " == "; break;
-                                      case ICmpInst::ICMP_NE: Out << " != "; break;
-                                      case ICmpInst::ICMP_SLT:
-                                      case ICmpInst::ICMP_ULT: Out << " < "; break;
-                                      case ICmpInst::ICMP_SLE:
-                                      case ICmpInst::ICMP_ULE: Out << " <= "; break;
-                                      case ICmpInst::ICMP_SGT:
-                                      case ICmpInst::ICMP_UGT: Out << " > "; break;
-                                      case ICmpInst::ICMP_SGE:
-                                      case ICmpInst::ICMP_UGE: Out << " >= "; break;
-                                      default: llvm_unreachable("Illegal ICmp predicate");
-                                    }
-                                    break;
-            default: llvm_unreachable("Illegal opcode here!");
-          }
-          printConstantWithCast(CE->getOperand(1), CE->getOpcode());
-          if (NeedsClosingParens)
-            Out << "))";
-          Out << ')';
-          return;
+        Out << " >> ";
+        break;
+      case Instruction::ICmp:
+        switch (CE->getPredicate()) {
+        case ICmpInst::ICMP_EQ:
+          Out << " == ";
+          break;
+        case ICmpInst::ICMP_NE:
+          Out << " != ";
+          break;
+        case ICmpInst::ICMP_SLT:
+        case ICmpInst::ICMP_ULT:
+          Out << " < ";
+          break;
+        case ICmpInst::ICMP_SLE:
+        case ICmpInst::ICMP_ULE:
+          Out << " <= ";
+          break;
+        case ICmpInst::ICMP_SGT:
+        case ICmpInst::ICMP_UGT:
+          Out << " > ";
+          break;
+        case ICmpInst::ICMP_SGE:
+        case ICmpInst::ICMP_UGE:
+          Out << " >= ";
+          break;
+        default:
+          llvm_unreachable("Illegal ICmp predicate");
         }
-      case Instruction::FCmp: {
-                                Out << '(';
-                                bool NeedsClosingParens = printConstExprCast(CE);
-                                if (CE->getPredicate() == FCmpInst::FCMP_FALSE)
-                                  Out << "0";
-                                else if (CE->getPredicate() == FCmpInst::FCMP_TRUE)
-                                  Out << "1";
-                                else {
-                                  Out << "llvm_fcmp_" << getCmpPredicateName((CmpInst::Predicate)CE->getPredicate()) << "(";
-                                  printConstant(CE->getOperand(0), ContextCasted);
-                                  Out << ", ";
-                                  printConstant(CE->getOperand(1), ContextCasted);
-                                  Out << ")";
-                                }
-                                if (NeedsClosingParens)
-                                  Out << "))";
-                                Out << ')';
-                                return;
-                              }
+        break;
       default:
+        llvm_unreachable("Illegal opcode here!");
+      }
+      printConstantWithCast(CE->getOperand(1), CE->getOpcode());
+      if (NeedsClosingParens)
+        Out << "))";
+      Out << ')';
+      return;
+    }
+    case Instruction::FCmp: {
+      Out << '(';
+      bool NeedsClosingParens = printConstExprCast(CE);
+      if (CE->getPredicate() == FCmpInst::FCMP_FALSE)
+        Out << "0";
+      else if (CE->getPredicate() == FCmpInst::FCMP_TRUE)
+        Out << "1";
+      else {
+        Out << "llvm_fcmp_"
+            << getCmpPredicateName((CmpInst::Predicate)CE->getPredicate())
+            << "(";
+        printConstant(CE->getOperand(0), ContextCasted);
+        Out << ", ";
+        printConstant(CE->getOperand(1), ContextCasted);
+        Out << ")";
+      }
+      if (NeedsClosingParens)
+        Out << "))";
+      Out << ')';
+      return;
+    }
+    default:
 #ifndef NDEBUG
-                              errs() << "CWriter Error: Unhandled constant expression: "
-                                << *CE << "\n";
+      errs() << "CWriter Error: Unhandled constant expression: " << *CE << "\n";
 #endif
-                              llvm_unreachable(0);
+      llvm_unreachable(0);
     }
   } else if (isa<UndefValue>(CPV) && CPV->getType()->isSingleValueType()) {
     if (CPV->getType()->isVectorTy()) {
@@ -985,7 +1090,8 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) {
       Constant *Zero = Constant::getNullValue(VT->getElementType());
       unsigned NumElts = VT->getNumElements();
       for (unsigned i = 0; i != NumElts; ++i) {
-        if (i) Out << ", ";
+        if (i)
+          Out << ", ";
         printConstant(Zero, ContextCasted);
       }
       Out << ")";
@@ -999,9 +1105,10 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) {
   }
 
   if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
-    Type* Ty = CI->getType();
+    Type *Ty = CI->getType();
     unsigned ActiveBits = CI->getValue().getMinSignedBits();
-    //    DEBUG(errs() << "Here: " << *CI << ", " << *Ty << ", " << ActiveBits << "\n");
+    //    DEBUG(errs() << "Here: " << *CI << ", " << *Ty << ", " << ActiveBits
+    //    << "\n");
     Out << CI->getSExtValue();
     //    if (Ty == Type::getInt1Ty(CPV->getContext())) {
     //      Out << (CI->getZExtValue() ? '1' : '0');
@@ -1014,7 +1121,8 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) {
     //      Out << CI->getSExtValue(); // most likely a shorter representation
     ////      if (ActiveBits >= 32)
     ////        Out << ")";
-    //    } else if (Ty->getPrimitiveSizeInBits() < 32 && Context == ContextNormal) {
+    //    } else if (Ty->getPrimitiveSizeInBits() < 32 && Context ==
+    //    ContextNormal) {
     //      Out << "((";
     //      printSimpleType(Out, Ty, false) << ')';
     //      if (CI->isMinValue(true))
@@ -1031,248 +1139,266 @@ void CWriter::printConstant(Constant *CPV, enum OperandContext Context) {
     ////      const APInt &V = CI->getValue();
     ////      const APInt &Vlo = V.getLoBits(64);
     ////      const APInt &Vhi = V.getHiBits(64);
-    ////      Out << (Context == ContextStatic ? "UINT128_C" : "llvm_ctor_u128");
-    ////      Out << "(UINT64_C(" << Vhi.getZExtValue() << "), UINT64_C(" << Vlo.getZExtValue() << "))";
+    ////      Out << (Context == ContextStatic ? "UINT128_C" : "llvm_ctor_u128");
+    ////      Out << "(UINT64_C(" << Vhi.getZExtValue() << "), UINT64_C("
+    ////          << Vlo.getZExtValue() << "))";
     //    }
     return;
   }
 
   switch (CPV->getType()->getTypeID()) {
-    case Type::FloatTyID:
-    case Type::DoubleTyID:
-    case Type::X86_FP80TyID:
-    case Type::PPC_FP128TyID:
-    case Type::FP128TyID: {
-                            ConstantFP *FPC = cast<ConstantFP>(CPV);
-                            std::map<const ConstantFP*, unsigned>::iterator I = FPConstantMap.find(FPC);
-                            if (I != FPConstantMap.end()) {
-                              // Because of FP precision problems we must load from a stack allocated
-                              // value that holds the value in hex.
-                              Out << "(*(" << (FPC->getType() == Type::getFloatTy(CPV->getContext()) ?
-                                      "float" :
-                                      FPC->getType() == Type::getDoubleTy(CPV->getContext()) ?
-                                      "double" :
-                                      "long double")
-                                << "*)&FPConstant" << I->second << ')';
-                            } else {
-                              double V;
-                              if (FPC->getType() == Type::getFloatTy(CPV->getContext()))
-                                V = FPC->getValueAPF().convertToFloat();
-                              else if (FPC->getType() == Type::getDoubleTy(CPV->getContext()))
-                                V = FPC->getValueAPF().convertToDouble();
-                              else {
-                                // Long double.  Convert the number to double, discarding precision.
-                                // This is not awesome, but it at least makes the CBE output somewhat
-                                // useful.
-                                APFloat Tmp = FPC->getValueAPF();
-                                bool LosesInfo;
-                                Tmp.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &LosesInfo);
-                                V = Tmp.convertToDouble();
-                              }
-
-                              if (std::isnan(V)) {
-                                // The value is NaN
-
-                                // FIXME the actual NaN bits should be emitted.
-                                // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN,
-                                // it's 0x7ff4.
-                                const unsigned long QuietNaN = 0x7ff8UL;
-                                //const unsigned long SignalNaN = 0x7ff4UL;
-
-                                // We need to grab the first part of the FP #
-                                char Buffer[100];
-
-                                uint64_t ll = DoubleToBits(V);
-                                sprintf(Buffer, "0x%llx", static_cast<long long>(ll));
-
-                                std::string Num(&Buffer[0], &Buffer[6]);
-                                unsigned long Val = strtoul(Num.c_str(), 0, 16);
-
-                                if (FPC->getType() == Type::getFloatTy(FPC->getContext()))
-                                  Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\""
-                                    << Buffer << "\") /*nan*/ ";
-                                else
-                                  Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\""
-                                    << Buffer << "\") /*nan*/ ";
-                              } else if (std::isinf(V)) {
-                                // The value is Inf
-                                if (V < 0) Out << '-';
-                                Out << "LLVM_INF" <<
-                                  (FPC->getType() == Type::getFloatTy(FPC->getContext()) ? "F" : "")
-                                  << " /*inf*/ ";
-                              } else {
-                                std::string Num;
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+  case Type::X86_FP80TyID:
+  case Type::PPC_FP128TyID:
+  case Type::FP128TyID: {
+    ConstantFP *FPC = cast<ConstantFP>(CPV);
+    std::map<const ConstantFP *, unsigned>::iterator I =
+        FPConstantMap.find(FPC);
+    if (I != FPConstantMap.end()) {
+      // Because of FP precision problems we must load from a stack allocated
+      // value that holds the value in hex.
+      Out << "(*("
+          << (FPC->getType() == Type::getFloatTy(CPV->getContext())
+                  ? "float"
+                  : FPC->getType() == Type::getDoubleTy(CPV->getContext())
+                        ? "double"
+                        : "long double")
+          << "*)&FPConstant" << I->second << ')';
+    } else {
+      double V;
+      if (FPC->getType() == Type::getFloatTy(CPV->getContext()))
+        V = FPC->getValueAPF().convertToFloat();
+      else if (FPC->getType() == Type::getDoubleTy(CPV->getContext()))
+        V = FPC->getValueAPF().convertToDouble();
+      else {
+        // Long double.  Convert the number to double, discarding precision.
+        // This is not awesome, but it at least makes the CBE output somewhat
+        // useful.
+        APFloat Tmp = FPC->getValueAPF();
+        bool LosesInfo;
+        Tmp.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &LosesInfo);
+        V = Tmp.convertToDouble();
+      }
+
+      if (std::isnan(V)) {
+        // The value is NaN
+
+        // FIXME the actual NaN bits should be emitted.
+        // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN,
+        // it's 0x7ff4.
+        const unsigned long QuietNaN = 0x7ff8UL;
+        // const unsigned long SignalNaN = 0x7ff4UL;
+
+        // We need to grab the first part of the FP #
+        char Buffer[100];
+
+        uint64_t ll = DoubleToBits(V);
+        sprintf(Buffer, "0x%llx", static_cast<long long>(ll));
+
+        std::string Num(&Buffer[0], &Buffer[6]);
+        unsigned long Val = strtoul(Num.c_str(), 0, 16);
+
+        if (FPC->getType() == Type::getFloatTy(FPC->getContext()))
+          Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\"" << Buffer
+              << "\") /*nan*/ ";
+        else
+          Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\"" << Buffer
+              << "\") /*nan*/ ";
+      } else if (std::isinf(V)) {
+        // The value is Inf
+        if (V < 0)
+          Out << '-';
+        Out << "LLVM_INF"
+            << (FPC->getType() == Type::getFloatTy(FPC->getContext()) ? "F"
+                                                                      : "")
+            << " /*inf*/ ";
+      } else {
+        std::string Num;
 #if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
-                                // Print out the constant as a floating point number.
-                                char Buffer[100];
-                                sprintf(Buffer, "%a", V);
-                                Num = Buffer;
+        // Print out the constant as a floating point number.
+        char Buffer[100];
+        sprintf(Buffer, "%a", V);
+        Num = Buffer;
 #else
-                                Num = ftostr(FPC->getValueAPF());
-#endif
-                                Out << Num;
-                              }
-                            }
-                            break;
-                          }
-
-    case Type::ArrayTyID: {
-                            if (printConstantString(CPV, Context)) break;
-                            ArrayType *AT = cast<ArrayType>(CPV->getType());
-                            assert(AT->getNumElements() != 0 && !isEmptyType(AT));
-                            if (Context != ContextStatic) {
-                              CtorDeclTypes.insert(AT);
-                              Out << "llvm_ctor_";
-                              printTypeString(Out, AT, false);
-                              Out << "(";
-                              Context = ContextCasted;
-                            } else {
-                              Out << "{ { "; // Arrays are wrapped in struct types.
-                            }
-                            if (ConstantArray *CA = dyn_cast<ConstantArray>(CPV)) {
-                              printConstantArray(CA, Context);
-                            } else if (ConstantDataSequential *CDS =
-                                dyn_cast<ConstantDataSequential>(CPV)) {
-                              printConstantDataSequential(CDS, Context);
-                            } else {
-                              assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
-                              Constant *CZ = Constant::getNullValue(AT->getElementType());
-                              printConstant(CZ, Context);
-                              for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) {
-                                Out << ", ";
-                                printConstant(CZ, Context);
-                              }
-                            }
-                            Out << (Context == ContextStatic ? " } }" : ")"); // Arrays are wrapped in struct types.
-                            break;
-                          }
-
-    case Type::VectorTyID: {
-                             VectorType *VT = cast<VectorType>(CPV->getType());
-                             assert(VT->getNumElements() != 0 && !isEmptyType(VT));
-                             if (Context != ContextStatic) {
-                               CtorDeclTypes.insert(VT);
-                               Out << "llvm_ctor_";
-                               printTypeString(Out, VT, false);
-                               Out << "(";
-                               Context = ContextCasted;
-                             } else {
-                               Out << "{ ";
-                             }
-                             if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) {
-                               printConstantVector(CV, Context);
-                             } else if (ConstantDataSequential *CDS =
-                                 dyn_cast<ConstantDataSequential>(CPV)) {
-                               printConstantDataSequential(CDS, Context);
-                             } else {
-                               assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
-                               Constant *CZ = Constant::getNullValue(VT->getElementType());
-                               printConstant(CZ, Context);
-                               for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) {
-                                 Out << ", ";
-                                 printConstant(CZ, Context);
-                               }
-                             }
-                             Out << (Context == ContextStatic ? " }" : ")");
-                             break;
-                           }
-
-    case Type::StructTyID: {
-                             StructType *ST = cast<StructType>(CPV->getType());
-                             assert(!isEmptyType(ST));
-                             if (Context != ContextStatic) {
-                               CtorDeclTypes.insert(ST);
-                               Out << "llvm_ctor_";
-                               printTypeString(Out, ST, false);
-                               Out << "(";
-                               Context = ContextCasted;
-                             } else {
-                               Out << "{ ";
-                             }
-
-                             if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) {
-                               bool printed = false;
-                               for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
-                                 Type *ElTy = ST->getElementType(i);
-                                 if (isEmptyType(ElTy)) continue;
-                                 if (printed) Out << ", ";
-                                 printConstant(Constant::getNullValue(ElTy), Context);
-                                 printed = true;
-                               }
-                               assert(printed);
-                             } else {
-                               bool printed = false;
-                               for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) {
-                                 Constant *C = cast<Constant>(CPV->getOperand(i));
-                                 if (isEmptyType(C->getType())) continue;
-                                 if (printed) Out << ", ";
-                                 printConstant(C, Context);
-                                 printed = true;
-                               }
-                               assert(printed);
-                             }
-                             Out << (Context == ContextStatic ? " }" : ")");
-                             break;
-                           }
-
-    case Type::PointerTyID:
-                           if (isa<ConstantPointerNull>(CPV)) {
-                             Out << "((";
-                             printTypeName(Out, CPV->getType()); // sign doesn't matter
-                             Out << ")/*NULL*/0)";
-                             break;
-                           } else if (GlobalValue *GV = dyn_cast<GlobalValue>(CPV)) {
-                             writeOperand(GV);
-                             break;
-                           }
-                           // FALL THROUGH
-    default:
-#ifndef NDEBUG
-                           errs() << "Unknown constant type: " << *CPV << "\n";
+        Num = ftostr(FPC->getValueAPF());
 #endif
-                           llvm_unreachable(0);
+        Out << Num;
+      }
+    }
+    break;
   }
-}
 
-// Some constant expressions need to be casted back to the original types
-// because their operands were casted to the expected type. This function takes
-// care of detecting that case and printing the cast for the ConstantExpr.
-bool CWriter::printConstExprCast(ConstantExpr* CE) {
-  bool NeedsExplicitCast = false;
-  Type *Ty = CE->getOperand(0)->getType();
-  bool TypeIsSigned = false;
-  switch (CE->getOpcode()) {
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::Mul:
-      // We need to cast integer arithmetic so that it is always performed
-      // as unsigned, to avoid undefined behavior on overflow.
-    case Instruction::LShr:
-    case Instruction::URem:
-    case Instruction::UDiv: NeedsExplicitCast = true; break;
-    case Instruction::AShr:
-    case Instruction::SRem:
-    case Instruction::SDiv: NeedsExplicitCast = true; TypeIsSigned = true; break;
-    case Instruction::SExt:
-                            Ty = CE->getType();
-                            NeedsExplicitCast = true;
-                            TypeIsSigned = true;
-                            break;
-    case Instruction::ZExt:
-    case Instruction::Trunc:
-    case Instruction::FPTrunc:
-    case Instruction::FPExt:
-    case Instruction::UIToFP:
-    case Instruction::SIToFP:
-    case Instruction::FPToUI:
-    case Instruction::FPToSI:
-    case Instruction::PtrToInt:
-    case Instruction::IntToPtr:
-    case Instruction::BitCast:
-                            Ty = CE->getType();
-                            NeedsExplicitCast = true;
-                            break;
-    default: break;
+  case Type::ArrayTyID: {
+    if (printConstantString(CPV, Context))
+      break;
+    ArrayType *AT = cast<ArrayType>(CPV->getType());
+    assert(AT->getNumElements() != 0 && !isEmptyType(AT));
+    if (Context != ContextStatic) {
+      CtorDeclTypes.insert(AT);
+      Out << "llvm_ctor_";
+      printTypeString(Out, AT, false);
+      Out << "(";
+      Context = ContextCasted;
+    } else {
+      Out << "{ { "; // Arrays are wrapped in struct types.
+    }
+    if (ConstantArray *CA = dyn_cast<ConstantArray>(CPV)) {
+      printConstantArray(CA, Context);
+    } else if (ConstantDataSequential *CDS =
+                   dyn_cast<ConstantDataSequential>(CPV)) {
+      printConstantDataSequential(CDS, Context);
+    } else {
+      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
+      Constant *CZ = Constant::getNullValue(AT->getElementType());
+      printConstant(CZ, Context);
+      for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) {
+        Out << ", ";
+        printConstant(CZ, Context);
+      }
+    }
+    Out << (Context == ContextStatic
+                ? " } }"
+                : ")"); // Arrays are wrapped in struct types.
+    break;
+  }
+
+  case Type::VectorTyID: {
+    VectorType *VT = cast<VectorType>(CPV->getType());
+    assert(VT->getNumElements() != 0 && !isEmptyType(VT));
+    if (Context != ContextStatic) {
+      CtorDeclTypes.insert(VT);
+      Out << "llvm_ctor_";
+      printTypeString(Out, VT, false);
+      Out << "(";
+      Context = ContextCasted;
+    } else {
+      Out << "{ ";
+    }
+    if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) {
+      printConstantVector(CV, Context);
+    } else if (ConstantDataSequential *CDS =
+                   dyn_cast<ConstantDataSequential>(CPV)) {
+      printConstantDataSequential(CDS, Context);
+    } else {
+      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
+      Constant *CZ = Constant::getNullValue(VT->getElementType());
+      printConstant(CZ, Context);
+      for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) {
+        Out << ", ";
+        printConstant(CZ, Context);
+      }
+    }
+    Out << (Context == ContextStatic ? " }" : ")");
+    break;
+  }
+
+  case Type::StructTyID: {
+    StructType *ST = cast<StructType>(CPV->getType());
+    assert(!isEmptyType(ST));
+    if (Context != ContextStatic) {
+      CtorDeclTypes.insert(ST);
+      Out << "llvm_ctor_";
+      printTypeString(Out, ST, false);
+      Out << "(";
+      Context = ContextCasted;
+    } else {
+      Out << "{ ";
+    }
+
+    if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) {
+      bool printed = false;
+      for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+        Type *ElTy = ST->getElementType(i);
+        if (isEmptyType(ElTy))
+          continue;
+        if (printed)
+          Out << ", ";
+        printConstant(Constant::getNullValue(ElTy), Context);
+        printed = true;
+      }
+      assert(printed);
+    } else {
+      bool printed = false;
+      for (unsigned i = 0, e = CPV->getNumOperands(); i != e; ++i) {
+        Constant *C = cast<Constant>(CPV->getOperand(i));
+        if (isEmptyType(C->getType()))
+          continue;
+        if (printed)
+          Out << ", ";
+        printConstant(C, Context);
+        printed = true;
+      }
+      assert(printed);
+    }
+    Out << (Context == ContextStatic ? " }" : ")");
+    break;
+  }
+
+  case Type::PointerTyID:
+    if (isa<ConstantPointerNull>(CPV)) {
+      Out << "((";
+      printTypeName(Out, CPV->getType()); // sign doesn't matter
+      Out << ")/*NULL*/0)";
+      break;
+    } else if (GlobalValue *GV = dyn_cast<GlobalValue>(CPV)) {
+      writeOperand(GV);
+      break;
+    }
+    // FALL THROUGH
+  default:
+#ifndef NDEBUG
+    errs() << "Unknown constant type: " << *CPV << "\n";
+#endif
+    llvm_unreachable(0);
+  }
+}
+
+// Some constant expressions need to be casted back to the original types
+// because their operands were casted to the expected type. This function takes
+// care of detecting that case and printing the cast for the ConstantExpr.
+bool CWriter::printConstExprCast(ConstantExpr *CE) {
+  bool NeedsExplicitCast = false;
+  Type *Ty = CE->getOperand(0)->getType();
+  bool TypeIsSigned = false;
+  switch (CE->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // We need to cast integer arithmetic so that it is always performed
+    // as unsigned, to avoid undefined behavior on overflow.
+  case Instruction::LShr:
+  case Instruction::URem:
+  case Instruction::UDiv:
+    NeedsExplicitCast = true;
+    break;
+  case Instruction::AShr:
+  case Instruction::SRem:
+  case Instruction::SDiv:
+    NeedsExplicitCast = true;
+    TypeIsSigned = true;
+    break;
+  case Instruction::SExt:
+    Ty = CE->getType();
+    NeedsExplicitCast = true;
+    TypeIsSigned = true;
+    break;
+  case Instruction::ZExt:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast:
+    Ty = CE->getType();
+    NeedsExplicitCast = true;
+    break;
+  default:
+    break;
   }
   if (NeedsExplicitCast) {
     Out << "((";
@@ -1285,11 +1411,13 @@ bool CWriter::printConstExprCast(ConstantExpr* CE) {
 //  Print a constant assuming that it is the operand for a given Opcode. The
 //  opcodes that care about sign need to cast their operands to the expected
 //  type before the operation proceeds. This function does the casting.
-void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) {
+void CWriter::printConstantWithCast(Constant *CPV, unsigned Opcode) {
 
   // Extract the operand's type, we'll need it.
-  Type* OpTy = CPV->getType();
-  assert(OpTy->isIntegerTy() || OpTy->isFloatingPointTy()); // TODO: VectorType are valid here, but not supported
+  Type *OpTy = CPV->getType();
+  assert(OpTy->isIntegerTy() ||
+         OpTy->isFloatingPointTy()); // TODO: VectorType are valid here, but not
+                                     // supported
 
   // Indicate whether to do the cast or not.
   bool shouldCast;
@@ -1309,7 +1437,7 @@ void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) {
 }
 
 std::string CWriter::GetValueName(Value *Operand) {
-  //DEBUG(errs() << "In getvaluename: " << *Operand << "\n");
+  // DEBUG(errs() << "In getvaluename: " << *Operand << "\n");
 
   // Resolve potential alias.
   if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Operand)) {
@@ -1332,8 +1460,7 @@ std::string CWriter::GetValueName(Value *Operand) {
   std::string VarName;
   VarName.reserve(Name.capacity());
 
-  for (std::string::iterator I = Name.begin(), E = Name.end();
-      I != E; ++I) {
+  for (std::string::iterator I = Name.begin(), E = Name.end(); I != E; ++I) {
     unsigned char ch = *I;
 
     if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
@@ -1357,7 +1484,7 @@ void CWriter::writeInstComputationInline(Instruction &I) {
   unsigned mask = 0;
   Type *Ty = I.getType();
   if (Ty->isIntegerTy()) {
-    IntegerType *ITy = static_cast<IntegerType*>(Ty);
+    IntegerType *ITy = static_cast<IntegerType *>(Ty);
     if (!ITy->isPowerOf2ByteWidth())
       mask = ITy->getBitMask();
   }
@@ -1375,20 +1502,20 @@ void CWriter::writeInstComputationInline(Instruction &I) {
     Out << ")&" << mask << ")";
 }
 
-
-void CWriter::writeOperandInternal(Value *Operand, enum OperandContext Context) {
-  //DEBUG(errs() << "In write operand internal: " << *Operand << "\n"); 
+void CWriter::writeOperandInternal(Value *Operand,
+                                   enum OperandContext Context) {
+  // DEBUG(errs() << "In write operand internal: " << *Operand << "\n");
   if (Instruction *I = dyn_cast<Instruction>(Operand))
     // Should we inline this instruction to build a tree?
     if (isInlinableInst(*I) && !isDirectAlloca(I)) {
-      //DEBUG(errs() << "isInlinableInst & NOT isDirectAlloca\n" << "\n");
+      // DEBUG(errs() << "isInlinableInst & NOT isDirectAlloca\n" << "\n");
       Out << '(';
       writeInstComputationInline(*I);
       Out << ')';
       return;
     }
 
-  Constant* CPV = dyn_cast<Constant>(Operand);
+  Constant *CPV = dyn_cast<Constant>(Operand);
 
   if (CPV && !isa<GlobalValue>(CPV))
     printConstant(CPV, Context);
@@ -1396,12 +1523,14 @@ void CWriter::writeOperandInternal(Value *Operand, enum OperandContext Context)
     Out << GetValueName(Operand);
 }
 
-void CWriter::writeOperand(Value *Operand, enum OperandContext Context, bool arrayAccess) {
-  //DEBUG(errs() << "In write operand: " << *Operand << "; ArrayAccess = " << arrayAccess << "\n");
+void CWriter::writeOperand(Value *Operand, enum OperandContext Context,
+                           bool arrayAccess) {
+  // DEBUG(errs() << "In write operand: " << *Operand << "; ArrayAccess = " <<
+  // arrayAccess << "\n");
   bool isAddressImplicit = isAddressExposed(Operand);
   if (isAddressImplicit && !arrayAccess) {
     DEBUG(errs() << "isAddressImplicit & NOT arrayAccess!\n");
-    Out << "(&";  // Global variables are referenced as their addresses by llvm
+    Out << "(&"; // Global variables are referenced as their addresses by llvm
   }
   writeOperandInternal(Operand, Context);
 
@@ -1430,26 +1559,27 @@ void CWriter::writeOperandDeref(Value *Operand) {
 bool CWriter::writeInstructionCast(Instruction &I) {
   Type *Ty = I.getOperand(0)->getType();
   switch (I.getOpcode()) {
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::Mul:
-      // We need to cast integer arithmetic so that it is always performed
-      // as unsigned, to avoid undefined behavior on overflow.
-    case Instruction::LShr:
-    case Instruction::URem:
-    case Instruction::UDiv:
-      Out << "((";
-      printSimpleType(Out, Ty, false);
-      Out << ")(";
-      return true;
-    case Instruction::AShr:
-    case Instruction::SRem:
-    case Instruction::SDiv:
-      Out << "((";
-      printSimpleType(Out, Ty, true);
-      Out << ")(";
-      return true;
-    default: break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // We need to cast integer arithmetic so that it is always performed
+    // as unsigned, to avoid undefined behavior on overflow.
+  case Instruction::LShr:
+  case Instruction::URem:
+  case Instruction::UDiv:
+    Out << "((";
+    printSimpleType(Out, Ty, false);
+    Out << ")(";
+    return true;
+  case Instruction::AShr:
+  case Instruction::SRem:
+  case Instruction::SDiv:
+    Out << "((";
+    printSimpleType(Out, Ty, true);
+    Out << ")(";
+    return true;
+  default:
+    break;
   }
   return false;
 }
@@ -1457,7 +1587,8 @@ bool CWriter::writeInstructionCast(Instruction &I) {
 // Write the operand with a cast to another type based on the Opcode being used.
 // This will be used in cases where an instruction has specific type
 // requirements (usually signedness) for its operands.
-void CWriter::opcodeNeedsCast(unsigned Opcode,
+void CWriter::opcodeNeedsCast(
+    unsigned Opcode,
     // Indicate whether to do the cast or not.
     bool &shouldCast,
     // Indicate whether the cast should be to a signed type or not.
@@ -1467,33 +1598,33 @@ void CWriter::opcodeNeedsCast(unsigned Opcode,
   // the new type to which the operand should be casted by setting the value
   // of OpTy. If we change OpTy, also set shouldCast to true.
   switch (Opcode) {
-    default:
-      // for most instructions, it doesn't matter
-      shouldCast = false;
-      castIsSigned = false;
-      break;
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::Mul:
-      // We need to cast integer arithmetic so that it is always performed
-      // as unsigned, to avoid undefined behavior on overflow.
-    case Instruction::LShr:
-    case Instruction::UDiv:
-    case Instruction::URem: // Cast to unsigned first
-      shouldCast = true;
-      castIsSigned = false;
-      break;
-    case Instruction::GetElementPtr:
-    case Instruction::AShr:
-    case Instruction::SDiv:
-    case Instruction::SRem: // Cast to signed first
-      shouldCast = true;
-      castIsSigned = true;
-      break;
+  default:
+    // for most instructions, it doesn't matter
+    shouldCast = false;
+    castIsSigned = false;
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // We need to cast integer arithmetic so that it is always performed
+    // as unsigned, to avoid undefined behavior on overflow.
+  case Instruction::LShr:
+  case Instruction::UDiv:
+  case Instruction::URem: // Cast to unsigned first
+    shouldCast = true;
+    castIsSigned = false;
+    break;
+  case Instruction::GetElementPtr:
+  case Instruction::AShr:
+  case Instruction::SDiv:
+  case Instruction::SRem: // Cast to signed first
+    shouldCast = true;
+    castIsSigned = true;
+    break;
   }
 }
 
-void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) {
+void CWriter::writeOperandWithCast(Value *Operand, unsigned Opcode) {
   //  DEBUG(errs() << "Here: " << *Operand << "\n");
   // Write out the casted operand if we should, otherwise just write the
   // operand.
@@ -1511,12 +1642,12 @@ void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) {
   //    writeOperand(Operand, ContextCasted);
   //    Out << ")";
   //  } else
-  writeOperand(Operand, ContextNormal/*ContextCasted*/);
+  writeOperand(Operand, ContextNormal /*ContextCasted*/);
 }
 
 // Write the operand with a cast to another type based on the icmp predicate
 // being used.
-void CWriter::writeOperandWithCast(Value* Operand, ICmpInst &Cmp) {
+void CWriter::writeOperandWithCast(Value *Operand, ICmpInst &Cmp) {
   // This has to do a cast to ensure the operand has the right signedness.
   // Also, if the operand is a pointer, we make sure to cast to an integer when
   // doing the comparison both for signedness and so that the C compiler doesn't
@@ -1535,7 +1666,7 @@ void CWriter::writeOperandWithCast(Value* Operand, ICmpInst &Cmp) {
   bool castIsSigned = Cmp.isSigned();
 
   // If the operand was a pointer, convert to a large integer type.
-  Type* OpTy = Operand->getType();
+  Type *OpTy = Operand->getType();
   if (OpTy->isPointerTy())
     OpTy = TD->getIntPtrType(Operand->getContext());
 
@@ -1549,61 +1680,64 @@ void CWriter::writeOperandWithCast(Value* Operand, ICmpInst &Cmp) {
 // generateCompilerSpecificCode - This is where we add conditional compilation
 // directives to cater to specific compilers as need be.
 //
-static void generateCompilerSpecificCode(raw_ostream& Out,
-    const DataLayout *TD) {
+static void generateCompilerSpecificCode(raw_ostream &Out,
+                                         const DataLayout *TD) {
   // Alloca is hard to get, and we don't want to include stdlib.h here.
   Out << "/* get a declaration for alloca */\n"
-    << "#if defined(__CYGWIN__) || defined(__MINGW32__)\n"
-    << "#define  alloca(x) __builtin_alloca((x))\n"
-    << "#define _alloca(x) __builtin_alloca((x))\n"
-    << "#elif defined(__APPLE__)\n"
-    << "extern void *__builtin_alloca(unsigned long);\n"
-    << "#define alloca(x) __builtin_alloca(x)\n"
-    << "#define longjmp _longjmp\n"
-    << "#define setjmp _setjmp\n"
-    << "#elif defined(__sun__)\n"
-    << "#if defined(__sparcv9)\n"
-    << "extern void *__builtin_alloca(unsigned long);\n"
-    << "#else\n"
-    << "extern void *__builtin_alloca(unsigned int);\n"
-    << "#endif\n"
-    << "#define alloca(x) __builtin_alloca(x)\n"
-    << "#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__arm__)\n"
-    << "#define alloca(x) __builtin_alloca(x)\n"
-    << "#elif defined(_MSC_VER)\n"
-    << "#define alloca(x) _alloca(x)\n"
-    << "#else\n"
-    << "#include <alloca.h>\n"
-    << "#endif\n\n";
+      << "#if defined(__CYGWIN__) || defined(__MINGW32__)\n"
+      << "#define  alloca(x) __builtin_alloca((x))\n"
+      << "#define _alloca(x) __builtin_alloca((x))\n"
+      << "#elif defined(__APPLE__)\n"
+      << "extern void *__builtin_alloca(unsigned long);\n"
+      << "#define alloca(x) __builtin_alloca(x)\n"
+      << "#define longjmp _longjmp\n"
+      << "#define setjmp _setjmp\n"
+      << "#elif defined(__sun__)\n"
+      << "#if defined(__sparcv9)\n"
+      << "extern void *__builtin_alloca(unsigned long);\n"
+      << "#else\n"
+      << "extern void *__builtin_alloca(unsigned int);\n"
+      << "#endif\n"
+      << "#define alloca(x) __builtin_alloca(x)\n"
+      << "#elif defined(__FreeBSD__) || defined(__NetBSD__) || "
+         "defined(__OpenBSD__) || defined(__DragonFly__) || defined(__arm__)\n"
+      << "#define alloca(x) __builtin_alloca(x)\n"
+      << "#elif defined(_MSC_VER)\n"
+      << "#define alloca(x) _alloca(x)\n"
+      << "#else\n"
+      << "#include <alloca.h>\n"
+      << "#endif\n\n";
 
   // On Mac OS X, "external weak" is spelled "__attribute__((weak_import))".
   Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
-    << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n"
-    << "#elif defined(__GNUC__)\n"
-    << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n"
-    << "#else\n"
-    << "#define __EXTERNAL_WEAK__\n"
-    << "#endif\n\n";
+      << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n"
+      << "#elif defined(__GNUC__)\n"
+      << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n"
+      << "#else\n"
+      << "#define __EXTERNAL_WEAK__\n"
+      << "#endif\n\n";
 
   // For now, turn off the weak linkage attribute on Mac OS X. (See above.)
   Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
-    << "#define __ATTRIBUTE_WEAK__\n"
-    << "#elif defined(__GNUC__)\n"
-    << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n"
-    << "#else\n"
-    << "#define __ATTRIBUTE_WEAK__\n"
-    << "#endif\n\n";
+      << "#define __ATTRIBUTE_WEAK__\n"
+      << "#elif defined(__GNUC__)\n"
+      << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n"
+      << "#else\n"
+      << "#define __ATTRIBUTE_WEAK__\n"
+      << "#endif\n\n";
 
   // Add hidden visibility support. FIXME: APPLE_CC?
   Out << "#if defined(__GNUC__)\n"
-    << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n"
-    << "#endif\n\n";
+      << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n"
+      << "#endif\n\n";
 
   // Define unaligned-load helper macro
   Out << "#ifdef _MSC_VER\n";
-  Out << "#define __UNALIGNED_LOAD__(type, align, op) *((type __unaligned*)op)\n";
+  Out << "#define __UNALIGNED_LOAD__(type, align, op) *((type "
+         "__unaligned*)op)\n";
   Out << "#else\n";
-  Out << "#define __UNALIGNED_LOAD__(type, align, op) ((struct { type data __attribute__((packed, aligned(align))); }*)op)->data\n";
+  Out << "#define __UNALIGNED_LOAD__(type, align, op) ((struct { type data "
+         "__attribute__((packed, aligned(align))); }*)op)->data\n";
   Out << "#endif\n\n";
 
   // Define unaligned-load helper macro
@@ -1654,110 +1788,144 @@ static void generateCompilerSpecificCode(raw_ostream& Out,
   //
   // Similar to __builtin_inf, except the return type is float.
   Out << "#ifdef __GNUC__\n"
-    << "#define LLVM_NAN(NanStr)   __builtin_nan(NanStr)   /* Double */\n"
-    << "#define LLVM_NANF(NanStr)  __builtin_nanf(NanStr)  /* Float */\n"
-    //<< "#define LLVM_NANS(NanStr)  __builtin_nans(NanStr)  /* Double */\n"
-    //<< "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n"
-    << "#define LLVM_INF           __builtin_inf()         /* Double */\n"
-    << "#define LLVM_INFF          __builtin_inff()        /* Float */\n"
-    << "#define LLVM_PREFETCH(addr,rw,locality) "
-    "__builtin_prefetch(addr,rw,locality)\n"
-    << "#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n"
-    << "#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n"
-    << "#else\n"
-    << "#define LLVM_NAN(NanStr)   ((double)NAN)           /* Double */\n"
-    << "#define LLVM_NANF(NanStr)  ((float)NAN))           /* Float */\n"
-    //<< "#define LLVM_NANS(NanStr)  ((double)NAN)           /* Double */\n"
-    //<< "#define LLVM_NANSF(NanStr) ((single)NAN)           /* Float */\n"
-    << "#define LLVM_INF           ((double)INFINITY)      /* Double */\n"
-    << "#define LLVM_INFF          ((float)INFINITY)       /* Float */\n"
-    << "#define LLVM_PREFETCH(addr,rw,locality)            /* PREFETCH */\n"
-    << "#define __ATTRIBUTE_CTOR__ \"__attribute__((constructor)) not supported on this compiler\"\n"
-    << "#define __ATTRIBUTE_DTOR__ \"__attribute__((destructor)) not supported on this compiler\"\n"
-    << "#endif\n\n";
-
-  Out << "#if !defined(__GNUC__) || __GNUC__ < 4 /* Old GCC's, or compilers not GCC */ \n"
-    << "#define __builtin_stack_save() 0   /* not implemented */\n"
-    << "#define __builtin_stack_restore(X) /* noop */\n"
-    << "#endif\n\n";
+      << "#define LLVM_NAN(NanStr)   __builtin_nan(NanStr)   /* Double */\n"
+      << "#define LLVM_NANF(NanStr)  __builtin_nanf(NanStr)  /* Float */\n"
+      //<< "#define LLVM_NANS(NanStr)  __builtin_nans(NanStr)  /* Double */\n"
+      //<< "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n"
+      << "#define LLVM_INF           __builtin_inf()         /* Double */\n"
+      << "#define LLVM_INFF          __builtin_inff()        /* Float */\n"
+      << "#define LLVM_PREFETCH(addr,rw,locality) "
+         "__builtin_prefetch(addr,rw,locality)\n"
+      << "#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n"
+      << "#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n"
+      << "#else\n"
+      << "#define LLVM_NAN(NanStr)   ((double)NAN)           /* Double */\n"
+      << "#define LLVM_NANF(NanStr)  ((float)NAN))           /* Float */\n"
+      //<< "#define LLVM_NANS(NanStr)  ((double)NAN)           /* Double */\n"
+      //<< "#define LLVM_NANSF(NanStr) ((single)NAN)           /* Float */\n"
+      << "#define LLVM_INF           ((double)INFINITY)      /* Double */\n"
+      << "#define LLVM_INFF          ((float)INFINITY)       /* Float */\n"
+      << "#define LLVM_PREFETCH(addr,rw,locality)            /* PREFETCH */\n"
+      << "#define __ATTRIBUTE_CTOR__ \"__attribute__((constructor)) not "
+         "supported on this compiler\"\n"
+      << "#define __ATTRIBUTE_DTOR__ \"__attribute__((destructor)) not "
+         "supported on this compiler\"\n"
+      << "#endif\n\n";
+
+  Out << "#if !defined(__GNUC__) || __GNUC__ < 4 /* Old GCC's, or compilers "
+         "not GCC */ \n"
+      << "#define __builtin_stack_save() 0   /* not implemented */\n"
+      << "#define __builtin_stack_restore(X) /* noop */\n"
+      << "#endif\n\n";
 
   // Output typedefs for 128-bit integers
-  Out << "#if defined(__GNUC__) && defined(__LP64__) /* 128-bit integer types */\n"
-    << "typedef int __attribute__((mode(TI))) int128_t;\n"
-    << "typedef unsigned __attribute__((mode(TI))) uint128_t;\n"
-    << "#define UINT128_C(hi, lo) (((uint128_t)(hi) << 64) | (uint128_t)(lo))\n"
-    << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {"
-    << " return UINT128_C(hi, lo); }\n"
-    << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t r) {"
-    << " return l == r; }\n"
-    << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t r) {"
-    << " return l != r; }\n"
-    << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t r) {"
-    << " return l <= r; }\n"
-    << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) {"
-    << " return l <= r; }\n"
-    << "static __forceinline bool llvm_icmp_uge_u128(uint128_t l, uint128_t r) {"
-    << " return l >= r; }\n"
-    << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) {"
-    << " return l >= r; }\n"
-    << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t r) {"
-    << " return l < r; }\n"
-    << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) {"
-    << " return l < r; }\n"
-    << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t r) {"
-    << " return l > r; }\n"
-    << "static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) {"
-    << " return l > r; }\n"
-
-    << "#else /* manual 128-bit types */\n"
-    // TODO: field order should be reversed for big-endian
-    << "typedef struct { ulong lo; ulong hi; } uint128_t;\n"
-    << "typedef uint128_t int128_t;\n"
-    << "#define UINT128_C(hi, lo) {(lo), (hi)}\n" // only use in Static context
-    << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {"
-    << " uint128_t r; r.lo = lo; r.hi = hi; return r; }\n"
-    << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi == r.hi && l.lo == r.lo; }\n"
-    << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi != r.hi || l.lo != r.lo; }\n"
-    << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi < r.hi ? 1 : (l.hi == r.hi ? l.lo <= l.lo : 0); }\n"
-    << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) {"
-    << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo <= (long)l.lo : 0); }\n"
-    << "static __forceinline bool llvm_icmp_uge_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? l.lo >= l.hi : 0); }\n"
-    << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) {"
-    << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo >= (long)l.lo : 0); }\n"
-    << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi < r.hi ? 1 : (l.hi == r.hi ? l.lo < l.hi : 0); }\n"
-    << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) {"
-    << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo < (long)l.lo : 0); }\n"
-    << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t r) {"
-    << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? l.lo > l.hi : 0); }\n"
-    << "static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) {"
-    << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo > (long)l.lo : 0); }\n"
-    << "#define __emulate_i128\n"
-    << "#endif\n\n";
+  Out << "#if defined(__GNUC__) && defined(__LP64__) /* 128-bit integer types "
+         "*/\n"
+      << "typedef int __attribute__((mode(TI))) int128_t;\n"
+      << "typedef unsigned __attribute__((mode(TI))) uint128_t;\n"
+      << "#define UINT128_C(hi, lo) (((uint128_t)(hi) << 64) | "
+         "(uint128_t)(lo))\n"
+      << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {"
+      << " return UINT128_C(hi, lo); }\n"
+      << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l == r; }\n"
+      << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l != r; }\n"
+      << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l <= r; }\n"
+      << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) "
+         "{"
+      << " return l <= r; }\n"
+      << "static __forceinline bool llvm_icmp_uge_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l >= r; }\n"
+      << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) "
+         "{"
+      << " return l >= r; }\n"
+      << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l < r; }\n"
+      << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) "
+         "{"
+      << " return l < r; }\n"
+      << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l > r; }\n"
+      << "static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) "
+         "{"
+      << " return l > r; }\n"
+
+      << "#else /* manual 128-bit types */\n"
+      // TODO: field order should be reversed for big-endian
+      << "typedef struct { ulong lo; ulong hi; } uint128_t;\n"
+      << "typedef uint128_t int128_t;\n"
+      << "#define UINT128_C(hi, lo) {(lo), (hi)}\n" // only use in Static
+                                                    // context
+      << "static __forceinline uint128_t llvm_ctor_u128(ulong hi, ulong lo) {"
+      << " uint128_t r; r.lo = lo; r.hi = hi; return r; }\n"
+      << "static __forceinline bool llvm_icmp_eq_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi == r.hi && l.lo == r.lo; }\n"
+      << "static __forceinline bool llvm_icmp_ne_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi != r.hi || l.lo != r.lo; }\n"
+      << "static __forceinline bool llvm_icmp_ule_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi < r.hi ? 1 : (l.hi == r.hi ? l.lo <= l.lo : 0); }\n"
+      << "static __forceinline bool llvm_icmp_sle_i128(int128_t l, int128_t r) "
+         "{"
+      << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo <= "
+         "(long)l.lo : 0); }\n"
+      << "static __forceinline bool llvm_icmp_uge_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? l.lo >= l.hi : 0); }\n"
+      << "static __forceinline bool llvm_icmp_sge_i128(int128_t l, int128_t r) "
+         "{"
+      << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo >= "
+         "(long)l.lo : 0); }\n"
+      << "static __forceinline bool llvm_icmp_ult_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi < r.hi ? 1 : (l.hi == r.hi ? l.lo < l.hi : 0); }\n"
+      << "static __forceinline bool llvm_icmp_slt_i128(int128_t l, int128_t r) "
+         "{"
+      << " return (long)l.hi < (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo < "
+         "(long)l.lo : 0); }\n"
+      << "static __forceinline bool llvm_icmp_ugt_u128(uint128_t l, uint128_t "
+         "r) {"
+      << " return l.hi > r.hi ? 1 : (l.hi == r.hi ? l.lo > l.hi : 0); }\n"
+      << "static __forceinline bool llvm_icmp_sgt_i128(int128_t l, int128_t r) "
+         "{"
+      << " return (long)l.hi > (long)r.hi ? 1 : (l.hi == r.hi ? (long)l.lo > "
+         "(long)l.lo : 0); }\n"
+      << "#define __emulate_i128\n"
+      << "#endif\n\n";
 
   // We output GCC specific attributes to preserve 'linkonce'ness on globals.
   // If we aren't being compiled with GCC, just drop these attributes.
   Out << "#ifdef _MSC_VER  /* Can only support \"linkonce\" vars with GCC */\n"
-    << "#define __attribute__(X)\n"
-    << "#endif\n\n";
+      << "#define __attribute__(X)\n"
+      << "#endif\n\n";
 }
 
 /// FindStaticTors - Given a static ctor/dtor list, unpack its contents into
 /// the StaticTors set.
-static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){
+static void FindStaticTors(GlobalVariable *GV,
+                           std::set<Function *> &StaticTors) {
   ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
-  if (!InitList) return;
+  if (!InitList)
+    return;
 
   for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
-    if (ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i))){
-      if (CS->getNumOperands() != 2) return;  // Not array of 2-element structs.
+    if (ConstantStruct *CS =
+            dyn_cast<ConstantStruct>(InitList->getOperand(i))) {
+      if (CS->getNumOperands() != 2)
+        return; // Not array of 2-element structs.
 
       if (CS->getOperand(1)->isNullValue())
-        return;  // Found a null terminator, exit printing.
+        return; // Found a null terminator, exit printing.
       Constant *FP = CS->getOperand(1);
       if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP))
         if (CE->isCast())
@@ -1769,7 +1937,8 @@ static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){
 
 enum SpecialGlobalClass {
   NotSpecial = 0,
-  GlobalCtors, GlobalDtors,
+  GlobalCtors,
+  GlobalDtors,
   NotPrinted
 };
 
@@ -1786,9 +1955,8 @@ static SpecialGlobalClass getGlobalVariableClass(GlobalVariable *GV) {
 
   // Otherwise, if it is other metadata, don't print it.  This catches things
   // like debug information.
-  if (StringRef(GV->getSection()) == "llvm.metadata")
-  {
-    //DEBUG(errs() << "Printing Metada!\n" << *GV << "\n");
+  if (StringRef(GV->getSection()) == "llvm.metadata") {
+    // DEBUG(errs() << "Printing Metada!\n" << *GV << "\n");
     return NotPrinted;
   }
   return NotSpecial;
@@ -1797,7 +1965,7 @@ static SpecialGlobalClass getGlobalVariableClass(GlobalVariable *GV) {
 // PrintEscapedString - Print each character of the specified string, escaping
 // it if it is not printable or if it is an escape char.
 static void PrintEscapedString(const char *Str, unsigned Length,
-    raw_ostream &Out) {
+                               raw_ostream &Out) {
   for (unsigned i = 0; i != Length; ++i) {
     unsigned char C = Str[i];
     if (isprint(C) && C != '\\' && C != '"')
@@ -1824,9 +1992,10 @@ bool CWriter::doInitialization(Module &M) {
 
   TD = new DataLayout(&M);
   IL = new IntrinsicLowering(*TD);
-  // CHECK: Looking at lib/CodeGen/IntrinsicsLowering.cpp this func not supported
-  // This func creates defs which are created once each call is referenced anyway
-  //IL->AddPrototypes(M);
+  // CHECK: Looking at lib/CodeGen/IntrinsicLowering.cpp, this function is not
+  // supported. It creates definitions which are created anyway once each call
+  // is referenced.
+  // IL->AddPrototypes(M);
 
 #if 0
   std::string Triple = TheModule->getTargetTriple();
@@ -1838,7 +2007,7 @@ bool CWriter::doInitialization(Module &M) {
     TAsm = Match->createMCAsmInfo(Triple);
 #endif
   TAsm = new CBEMCAsmInfo();
-  MRI  = new MCRegisterInfo();
+  MRI = new MCRegisterInfo();
   TCtx = new MCContext(TAsm, MRI, NULL);
   return false;
 }
@@ -1884,17 +2053,18 @@ bool CWriter::doFinalization(Module &M) {
 void CWriter::generateHeader(Module &M) {
   // Keep track of which functions are static ctors/dtors so they can have
   // an attribute added to their prototypes.
-  std::set<Function*> StaticCtors, StaticDtors;
-  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-      I != E; ++I) {
+  std::set<Function *> StaticCtors, StaticDtors;
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;
+       ++I) {
     switch (getGlobalVariableClass(&*I)) {
-      default: break;
-      case GlobalCtors:
-               FindStaticTors(&*I, StaticCtors);
-               break;
-      case GlobalDtors:
-               FindStaticTors(&*I, StaticDtors);
-               break;
+    default:
+      break;
+    case GlobalCtors:
+      FindStaticTors(&*I, StaticCtors);
+      break;
+    case GlobalDtors:
+      FindStaticTors(&*I, StaticDtors);
+      break;
     }
   }
 
@@ -1904,8 +2074,9 @@ void CWriter::generateHeader(Module &M) {
   //  Out << "#include <setjmp.h>\n";      // Unwind support
   //  Out << "#include <limits.h>\n";      // With overflow intrinsics support.
   //  Out << "#include <stdint.h>\n";      // Sized integer support
-  //  Out << "#include <math.h>\n";        // definitions for some math functions and numeric constants
-  //  Out << "#include <APInt-C.h>\n";     // Implementations of many llvm intrinsics
+  //  Out << "#include <math.h>\n";        // math functions, numeric constants
+  //  Out << "#include <APInt-C.h>\n";     // Implementations of many llvm
+  //                                       // intrinsics
   //  // Provide a definition for `bool' if not compiling with a C++ compiler.
   //  Out << "#ifndef __cplusplus\ntypedef unsigned char bool;\n#endif\n";
   //  Out << "\n";
@@ -1913,24 +2084,24 @@ void CWriter::generateHeader(Module &M) {
   //  generateCompilerSpecificCode(Out, TD);
 
   Out << "\n\n/* Support for floating point constants */\n"
-    << "typedef ulong ConstantDoubleTy;\n"
-    << "typedef uint ConstantFloatTy;\n"
-    << "typedef struct { ulong f1; ushort f2; "
-    "ushort pad[3]; } ConstantFP80Ty;\n"
-    // This is used for both kinds of 128-bit long double; meaning differs.
-    << "typedef struct { ulong f1; ulong f2; }"
-    " ConstantFP128Ty;\n"
-    << "\n\n/* OpenCL Pragmas */\n"
-    << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-    << "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
-    << "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
-    << "\n\n/* Global Declarations */\n";
+      << "typedef ulong ConstantDoubleTy;\n"
+      << "typedef uint ConstantFloatTy;\n"
+      << "typedef struct { ulong f1; ushort f2; "
+         "ushort pad[3]; } ConstantFP80Ty;\n"
+      // This is used for both kinds of 128-bit long double; meaning differs.
+      << "typedef struct { ulong f1; ulong f2; }"
+         " ConstantFP128Ty;\n"
+      << "\n\n/* OpenCL Pragmas */\n"
+      << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+      << "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
+      << "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
+      << "\n\n/* Global Declarations */\n";
 
   // First output all the declarations for the program, because C requires
   // Functions & globals to be declared before they are used.
   if (!M.getModuleInlineAsm().empty()) {
     Out << "\n/* Module asm statements */\n"
-      << "__asm__ (";
+        << "__asm__ (";
 
     // Split the string into lines, to make it easier to read the .ll file.
     std::string Asm = M.getModuleInlineAsm();
@@ -1940,22 +2111,22 @@ void CWriter::generateHeader(Module &M) {
       // We found a newline, print the portion of the asm string from the
       // last newline up to this newline.
       Out << "\"";
-      PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.begin()+NewLine),
-          Out);
+      PrintEscapedString(
+          std::string(Asm.begin() + CurPos, Asm.begin() + NewLine), Out);
       Out << "\\n\"\n";
-      CurPos = NewLine+1;
+      CurPos = NewLine + 1;
       NewLine = Asm.find_first_of('\n', CurPos);
     }
     Out << "\"";
-    PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.end()), Out);
+    PrintEscapedString(std::string(Asm.begin() + CurPos, Asm.end()), Out);
     Out << "\");\n"
-      << "/* End Module asm statements */\n";
+        << "/* End Module asm statements */\n";
   }
 
   // collect any remaining types
   raw_null_ostream NullOut;
-  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-      I != E; ++I) {
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;
+       ++I) {
     // Ignore special globals, such as debug info.
     if (getGlobalVariableClass(&*I))
       continue;
@@ -1967,8 +2138,9 @@ void CWriter::generateHeader(Module &M) {
   if (!M.global_empty()) {
     Out << "\n/* External Global Variable Declarations */\n";
     for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-        I != E; ++I) {
-      if (!I->isDeclaration() || isEmptyType(I->getType()->getPointerElementType()))
+         I != E; ++I) {
+      if (!I->isDeclaration() ||
+          isEmptyType(I->getType()->getPointerElementType()))
         continue;
 
       if (I->hasDLLImportStorageClass())
@@ -1988,8 +2160,8 @@ void CWriter::generateHeader(Module &M) {
 
       Type *ElTy = I->getType()->getElementType();
       unsigned Alignment = I->getAlignment();
-      bool IsOveraligned = Alignment &&
-        Alignment > TD->getABITypeAlignment(ElTy);
+      bool IsOveraligned =
+          Alignment && Alignment > TD->getABITypeAlignment(ElTy);
       //      if (IsOveraligned)
       //        Out << "__MSALIGN__(" << Alignment << ") ";
       printTypeName(Out, ElTy, false) << ' ' << GetValueName(&*I);
@@ -2006,64 +2178,53 @@ void CWriter::generateHeader(Module &M) {
   Out << "\n/* Function Declarations */\n";
 
   // Store the intrinsics which will be declared/defined below.
-  SmallVector<Function*, 16> intrinsicsToDefine;
+  SmallVector<Function *, 16> intrinsicsToDefine;
 
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
     // Don't print declarations for intrinsic functions.
     // Store the used intrinsics, which need to be explicitly defined.
     if (I->isIntrinsic()) {
       switch (I->getIntrinsicID()) {
-        default:
-          continue;
-        case Intrinsic::uadd_with_overflow:
-        case Intrinsic::sadd_with_overflow:
-        case Intrinsic::usub_with_overflow:
-        case Intrinsic::ssub_with_overflow:
-        case Intrinsic::umul_with_overflow:
-        case Intrinsic::smul_with_overflow:
-        case Intrinsic::bswap:
-        case Intrinsic::ceil:
-        case Intrinsic::ctlz:
-        case Intrinsic::ctpop:
-        case Intrinsic::cttz:
-        case Intrinsic::fabs:
-        case Intrinsic::floor:
-        case Intrinsic::fma:
-        case Intrinsic::fmuladd:
-        case Intrinsic::pow:
-        case Intrinsic::powi:
-        case Intrinsic::rint:
-        case Intrinsic::sqrt:
-        case Intrinsic::trunc:
-          intrinsicsToDefine.push_back(&*I);
-          continue;
+      default:
+        continue;
+      case Intrinsic::uadd_with_overflow:
+      case Intrinsic::sadd_with_overflow:
+      case Intrinsic::usub_with_overflow:
+      case Intrinsic::ssub_with_overflow:
+      case Intrinsic::umul_with_overflow:
+      case Intrinsic::smul_with_overflow:
+      case Intrinsic::bswap:
+      case Intrinsic::ceil:
+      case Intrinsic::ctlz:
+      case Intrinsic::ctpop:
+      case Intrinsic::cttz:
+      case Intrinsic::fabs:
+      case Intrinsic::floor:
+      case Intrinsic::fma:
+      case Intrinsic::fmuladd:
+      case Intrinsic::pow:
+      case Intrinsic::powi:
+      case Intrinsic::rint:
+      case Intrinsic::sqrt:
+      case Intrinsic::trunc:
+        intrinsicsToDefine.push_back(&*I);
+        continue;
       }
     }
 
     // Skip a few functions that have already been defined in headers
-    if (I->getName() == "setjmp" ||
-        I->getName() == "longjmp" ||
-        I->getName() == "_setjmp" ||
-        I->getName() == "siglongjmp" ||
-        I->getName() == "sigsetjmp" ||
-        I->getName() == "pow" ||
-        I->getName() == "powf" ||
-        I->getName() == "sqrt" ||
-        I->getName() == "sqrtf" ||
-        I->getName() == "trunc" ||
-        I->getName() == "truncf" ||
-        I->getName() == "rint" ||
-        I->getName() == "rintf" ||
-        I->getName() == "floor" ||
-        I->getName() == "floorf" ||
-        I->getName() == "ceil" ||
-        I->getName() == "ceilf" ||
-        I->getName() == "alloca" ||
-        I->getName() == "_alloca" ||
-        I->getName() == "_chkstk" ||
-        I->getName() == "__chkstk" ||
-        I->getName() == "___chkstk_ms")
-        continue;
+    if (I->getName() == "setjmp" || I->getName() == "longjmp" ||
+        I->getName() == "_setjmp" || I->getName() == "siglongjmp" ||
+        I->getName() == "sigsetjmp" || I->getName() == "pow" ||
+        I->getName() == "powf" || I->getName() == "sqrt" ||
+        I->getName() == "sqrtf" || I->getName() == "trunc" ||
+        I->getName() == "truncf" || I->getName() == "rint" ||
+        I->getName() == "rintf" || I->getName() == "floor" ||
+        I->getName() == "floorf" || I->getName() == "ceil" ||
+        I->getName() == "ceilf" || I->getName() == "alloca" ||
+        I->getName() == "_alloca" || I->getName() == "_chkstk" ||
+        I->getName() == "__chkstk" || I->getName() == "___chkstk_ms")
+      continue;
 
     if (I->hasDLLImportStorageClass())
       Out << "__declspec(dllimport) ";
@@ -2096,7 +2257,7 @@ void CWriter::generateHeader(Module &M) {
   if (!M.global_empty()) {
     Out << "\n\n/* Global Variable Definitions and Initialization */\n";
     for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-        I != E; ++I) {
+         I != E; ++I) {
       declareOneGlobalVariable(&*I);
     }
   }
@@ -2104,9 +2265,10 @@ void CWriter::generateHeader(Module &M) {
   // Alias declarations...
   if (!M.alias_empty()) {
     Out << "\n/* External Alias Declarations */\n";
-    for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
-        I != E; ++I) {
-      assert(!I->isDeclaration() && !isEmptyType(I->getType()->getPointerElementType()));
+    for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); I != E;
+         ++I) {
+      assert(!I->isDeclaration() &&
+             !isEmptyType(I->getType()->getPointerElementType()));
       if (I->hasLocalLinkage())
         continue; // Internal Global
 
@@ -2121,8 +2283,8 @@ void CWriter::generateHeader(Module &M) {
 
       Type *ElTy = I->getType()->getElementType();
       unsigned Alignment = I->getAlignment();
-      bool IsOveraligned = Alignment &&
-        Alignment > TD->getABITypeAlignment(ElTy);
+      bool IsOveraligned =
+          Alignment && Alignment > TD->getABITypeAlignment(ElTy);
       //      if (IsOveraligned)
       //        Out << "__MSALIGN__(" << Alignment << ") ";
       // GetValueName would resolve the alias, which is not what we want,
@@ -2177,9 +2339,11 @@ void CWriter::generateHeader(Module &M) {
   Out << "return 1; }\n";
 
   // Loop over all select operations
-  for (std::set<Type*>::iterator it = SelectDeclTypes.begin(), end = SelectDeclTypes.end();
-      it != end; ++it) {
-    // static __forceinline Rty llvm_select_u8x4(<bool x 4> condition, <u8 x 4> iftrue, <u8 x 4> ifnot) {
+  for (std::set<Type *>::iterator it = SelectDeclTypes.begin(),
+                                  end = SelectDeclTypes.end();
+       it != end; ++it) {
+    // static __forceinline Rty llvm_select_u8x4(<bool x 4> condition, <u8 x 4>
+    // iftrue, <u8 x 4> ifnot) {
     //   Rty r = {
     //     condition[0] ? iftrue[0] : ifnot[0],
     //     condition[1] ? iftrue[1] : ifnot[1],
@@ -2194,7 +2358,11 @@ void CWriter::generateHeader(Module &M) {
     printTypeString(Out, *it, false);
     Out << "(";
     if (isa<VectorType>(*it))
-      printTypeNameUnaligned(Out, VectorType::get(Type::getInt1Ty((*it)->getContext()), (*it)->getVectorNumElements()), false);
+      printTypeNameUnaligned(
+          Out,
+          VectorType::get(Type::getInt1Ty((*it)->getContext()),
+                          (*it)->getVectorNumElements()),
+          false);
     else
       Out << "bool";
     Out << " condition, ";
@@ -2207,19 +2375,22 @@ void CWriter::generateHeader(Module &M) {
     if (isa<VectorType>(*it)) {
       unsigned n, l = (*it)->getVectorNumElements();
       for (n = 0; n < l; n++) {
-        Out << "  r.vector[" << n << "] = condition.vector[" << n << "] ? iftrue.vector[" << n << "] : ifnot.vector[" << n << "];\n";
+        Out << "  r.vector[" << n << "] = condition.vector[" << n
+            << "] ? iftrue.vector[" << n << "] : ifnot.vector[" << n << "];\n";
       }
-    }
-    else {
+    } else {
       Out << "  r = condition ? iftrue : ifnot;\n";
     }
     Out << "  return r;\n}\n";
   }
 
   // Loop over all compare operations
-  for (std::set< std::pair<CmpInst::Predicate, VectorType*> >::iterator it = CmpDeclTypes.begin(), end = CmpDeclTypes.end();
-      it != end; ++it) {
-    // static __forceinline <bool x 4> llvm_icmp_ge_u8x4(<u8 x 4> l, <u8 x 4> r) {
+  for (std::set<std::pair<CmpInst::Predicate, VectorType *>>::iterator
+           it = CmpDeclTypes.begin(),
+           end = CmpDeclTypes.end();
+       it != end; ++it) {
+    // static __forceinline <bool x 4> llvm_icmp_ge_u8x4(<u8 x 4> l, <u8 x 4> r)
+    // {
     //   Rty c = {
     //     l[0] >= r[0],
     //     l[1] >= r[1],
@@ -2229,7 +2400,8 @@ void CWriter::generateHeader(Module &M) {
     //   return c;
     // }
     unsigned n, l = (*it).second->getVectorNumElements();
-    VectorType *RTy = VectorType::get(Type::getInt1Ty((*it).second->getContext()), l);
+    VectorType *RTy =
+        VectorType::get(Type::getInt1Ty((*it).second->getContext()), l);
     bool isSigned = CmpInst::isSigned((*it).first);
     Out << "static __forceinline ";
     printTypeName(Out, RTy, isSigned);
@@ -2249,25 +2421,40 @@ void CWriter::generateHeader(Module &M) {
     for (n = 0; n < l; n++) {
       Out << "  c.vector[" << n << "] = ";
       if (CmpInst::isFPPredicate((*it).first)) {
-        Out << "llvm_fcmp_ " << getCmpPredicateName((*it).first) << "(l.vector[" << n << "], r.vector[" << n << "]);\n";
+        // Append the predicate name directly to "llvm_fcmp_" so the emitted
+        // text is one identifier rather than two tokens split by a space.
+        Out << "llvm_fcmp_" << getCmpPredicateName((*it).first) << "(l.vector["
+            << n << "], r.vector[" << n << "]);\n";
       } else {
         Out << "l.vector[" << n << "]";
         switch ((*it).first) {
-          case CmpInst::ICMP_EQ:  Out << " == "; break;
-          case CmpInst::ICMP_NE:  Out << " != "; break;
-          case CmpInst::ICMP_ULE:
-          case CmpInst::ICMP_SLE: Out << " <= "; break;
-          case CmpInst::ICMP_UGE:
-          case CmpInst::ICMP_SGE: Out << " >= "; break;
-          case CmpInst::ICMP_ULT:
-          case CmpInst::ICMP_SLT: Out << " < "; break;
-          case CmpInst::ICMP_UGT:
-          case CmpInst::ICMP_SGT: Out << " > "; break;
-          default:
+        case CmpInst::ICMP_EQ:
+          Out << " == ";
+          break;
+        case CmpInst::ICMP_NE:
+          Out << " != ";
+          break;
+        case CmpInst::ICMP_ULE:
+        case CmpInst::ICMP_SLE:
+          Out << " <= ";
+          break;
+        case CmpInst::ICMP_UGE:
+        case CmpInst::ICMP_SGE:
+          Out << " >= ";
+          break;
+        case CmpInst::ICMP_ULT:
+        case CmpInst::ICMP_SLT:
+          Out << " < ";
+          break;
+        case CmpInst::ICMP_UGT:
+        case CmpInst::ICMP_SGT:
+          Out << " > ";
+          break;
+        default:
 #ifndef NDEBUG
-                                  errs() << "Invalid icmp predicate!" << (*it).first;
+          errs() << "Invalid icmp predicate!" << (*it).first;
 #endif
-                                  llvm_unreachable(0);
+          llvm_unreachable(0);
         }
         Out << "r.vector[" << n << "];\n";
       }
@@ -2276,9 +2461,13 @@ void CWriter::generateHeader(Module &M) {
   }
 
   // Loop over all (vector) cast operations
-  for (std::set<std::pair<CastInst::CastOps, std::pair<Type*, Type*>>>::iterator it = CastOpDeclTypes.begin(), end = CastOpDeclTypes.end();
-      it != end; ++it) {
-    // static __forceinline <u32 x 4> llvm_ZExt_u8x4_u32x4(<u8 x 4> in) { // Src->isVector == Dst->isVector
+  for (std::set<
+           std::pair<CastInst::CastOps, std::pair<Type *, Type *>>>::iterator
+           it = CastOpDeclTypes.begin(),
+           end = CastOpDeclTypes.end();
+       it != end; ++it) {
+    // When Src->isVector == Dst->isVector:
+    // static __forceinline <u32 x 4> llvm_ZExt_u8x4_u32x4(<u8 x 4> in) {
     //   Rty out = {
     //     in[0],
     //     in[1],
@@ -2287,7 +2476,8 @@ void CWriter::generateHeader(Module &M) {
     //   };
     //   return out;
     // }
-    // static __forceinline u32 llvm_BitCast_u8x4_u32(<u8 x 4> in) { // Src->bitsSize == Dst->bitsSize
+    // When Src->bitsSize == Dst->bitsSize:
+    // static __forceinline u32 llvm_BitCast_u8x4_u32(<u8 x 4> in) {
     //   union {
     //     <u8 x 4> in;
     //     u32 out;
@@ -2300,18 +2490,25 @@ void CWriter::generateHeader(Module &M) {
     Type *DstTy = (*it).second.second;
     bool SrcSigned, DstSigned;
+    // Set the signedness flags for this cast opcode. Every case needs its own
+    // `break`: without it control falls through to SExt and both flags are
+    // true for every opcode.
     switch (opcode) {
-      default:
-        SrcSigned = false;
-        DstSigned = false;
-      case Instruction::SIToFP:
-        SrcSigned = true;
-        DstSigned = false;
-      case Instruction::FPToSI:
-        SrcSigned = false;
-        DstSigned = true;
-      case Instruction::SExt:
-        SrcSigned = true;
-        DstSigned = true;
+    default:
+      SrcSigned = false;
+      DstSigned = false;
+      break;
+    case Instruction::SIToFP:
+      SrcSigned = true;
+      DstSigned = false;
+      break;
+    case Instruction::FPToSI:
+      SrcSigned = false;
+      DstSigned = true;
+      break;
+    case Instruction::SExt:
+      SrcSigned = true;
+      DstSigned = true;
+      break;
     }
 
     Out << "static __forceinline ";
@@ -2350,20 +2540,34 @@ void CWriter::generateHeader(Module &M) {
       Out << " out;\n";
       Out << "  LLVM";
       switch (opcode) {
-        case Instruction::UIToFP: Out << "UItoFP"; break;
-        case Instruction::SIToFP: Out << "SItoFP"; break;
-        case Instruction::Trunc: Out << "Trunc"; break;
-                                 //case Instruction::FPExt:
-                                 //case Instruction::FPTrunc:
-        case Instruction::ZExt: Out << "ZExt"; break;
-        case Instruction::FPToUI: Out << "FPtoUI"; break;
-        case Instruction::SExt: Out << "SExt"; break;
-        case Instruction::FPToSI: Out << "FPtoSI"; break;
-        default:
-                                  llvm_unreachable("Invalid cast opcode for i128");
+      case Instruction::UIToFP:
+        Out << "UItoFP";
+        break;
+      case Instruction::SIToFP:
+        Out << "SItoFP";
+        break;
+      case Instruction::Trunc:
+        Out << "Trunc";
+        break;
+        // case Instruction::FPExt:
+        // case Instruction::FPTrunc:
+      case Instruction::ZExt:
+        Out << "ZExt";
+        break;
+      case Instruction::FPToUI:
+        Out << "FPtoUI";
+        break;
+      case Instruction::SExt:
+        Out << "SExt";
+        break;
+      case Instruction::FPToSI:
+        Out << "FPtoSI";
+        break;
+      default:
+        llvm_unreachable("Invalid cast opcode for i128");
       }
       Out << "(" << SrcTy->getPrimitiveSizeInBits() << ", &in, "
-        << DstTy->getPrimitiveSizeInBits() << ", &out);\n";
+          << DstTy->getPrimitiveSizeInBits() << ", &out);\n";
       Out << "  return out;\n";
       Out << "#endif\n";
       Out << "}\n";
@@ -2371,9 +2575,12 @@ void CWriter::generateHeader(Module &M) {
   }
 
   // Loop over all simple vector operations
-  for (std::set<std::pair<unsigned, Type*>>::iterator it = InlineOpDeclTypes.begin(), end = InlineOpDeclTypes.end();
-      it != end; ++it) {
-    // static __forceinline <u32 x 4> llvm_BinOp_u32x4(<u32 x 4> a, <u32 x 4> b) {
+  for (std::set<std::pair<unsigned, Type *>>::iterator
+           it = InlineOpDeclTypes.begin(),
+           end = InlineOpDeclTypes.end();
+       it != end; ++it) {
+    // static __forceinline <u32 x 4> llvm_BinOp_u32x4(<u32 x 4> a, <u32 x 4> b)
+    // {
     //   Rty r = {
     //      a[0] OP b[0],
     //      a[1] OP b[1],
@@ -2417,7 +2624,7 @@ void CWriter::generateHeader(Module &M) {
     // C can't handle non-power-of-two integer types
     unsigned mask = 0;
     if (ElemTy->isIntegerTy()) {
-      IntegerType *ITy = static_cast<IntegerType*>(ElemTy);
+      IntegerType *ITy = static_cast<IntegerType *>(ElemTy);
       if (!ITy->isPowerOf2ByteWidth())
         mask = ITy->getBitMask();
     }
@@ -2439,34 +2646,54 @@ void CWriter::generateHeader(Module &M) {
             Out << "fmodf(a.vector[" << n << "], b.vector[" << n << "])";
           else if (ElemTy->isDoubleTy())
             Out << "fmod(a.vector[" << n << "], b.vector[" << n << "])";
-          else  // all 3 flavors of long double
+          else // all 3 flavors of long double
             Out << "fmodl(a.vector[" << n << "], b.vector[" << n << "])";
         } else {
           Out << "a.vector[" << n << "]";
           switch (opcode) {
-            case Instruction::Add:
-            case Instruction::FAdd: Out << " + "; break;
-            case Instruction::Sub:
-            case Instruction::FSub: Out << " - "; break;
-            case Instruction::Mul:
-            case Instruction::FMul: Out << " * "; break;
-            case Instruction::URem:
-            case Instruction::SRem:
-            case Instruction::FRem: Out << " % "; break;
-            case Instruction::UDiv:
-            case Instruction::SDiv:
-            case Instruction::FDiv: Out << " / "; break;
-            case Instruction::And:  Out << " & "; break;
-            case Instruction::Or:   Out << " | "; break;
-            case Instruction::Xor:  Out << " ^ "; break;
-            case Instruction::Shl : Out << " << "; break;
-            case Instruction::LShr:
-            case Instruction::AShr: Out << " >> "; break;
-            default:
+          case Instruction::Add:
+          case Instruction::FAdd:
+            Out << " + ";
+            break;
+          case Instruction::Sub:
+          case Instruction::FSub:
+            Out << " - ";
+            break;
+          case Instruction::Mul:
+          case Instruction::FMul:
+            Out << " * ";
+            break;
+          case Instruction::URem:
+          case Instruction::SRem:
+          case Instruction::FRem:
+            Out << " % ";
+            break;
+          case Instruction::UDiv:
+          case Instruction::SDiv:
+          case Instruction::FDiv:
+            Out << " / ";
+            break;
+          case Instruction::And:
+            Out << " & ";
+            break;
+          case Instruction::Or:
+            Out << " | ";
+            break;
+          case Instruction::Xor:
+            Out << " ^ ";
+            break;
+          case Instruction::Shl:
+            Out << " << ";
+            break;
+          case Instruction::LShr:
+          case Instruction::AShr:
+            Out << " >> ";
+            break;
+          default:
 #ifndef NDEBUG
-                                    errs() << "Invalid operator type!" << opcode;
+            errs() << "Invalid operator type!" << opcode;
 #endif
-                                    llvm_unreachable(0);
+            llvm_unreachable(0);
           }
           Out << "b.vector[" << n << "]";
         }
@@ -2487,24 +2714,44 @@ void CWriter::generateHeader(Module &M) {
       } else {
         Out << "a";
         switch (opcode) {
-          case Instruction::Add: Out << " + "; break;
-          case Instruction::Sub: Out << " - "; break;
-          case Instruction::Mul: Out << " * "; break;
-          case Instruction::URem:
-          case Instruction::SRem: Out << " % "; break;
-          case Instruction::UDiv:
-          case Instruction::SDiv: Out << " / "; break;
-          case Instruction::And:  Out << " & "; break;
-          case Instruction::Or:   Out << " | "; break;
-          case Instruction::Xor:  Out << " ^ "; break;
-          case Instruction::Shl:  Out << " << "; break;
-          case Instruction::LShr:
-          case Instruction::AShr: Out << " >> "; break;
-          default:
+        case Instruction::Add:
+          Out << " + ";
+          break;
+        case Instruction::Sub:
+          Out << " - ";
+          break;
+        case Instruction::Mul:
+          Out << " * ";
+          break;
+        case Instruction::URem:
+        case Instruction::SRem:
+          Out << " % ";
+          break;
+        case Instruction::UDiv:
+        case Instruction::SDiv:
+          Out << " / ";
+          break;
+        case Instruction::And:
+          Out << " & ";
+          break;
+        case Instruction::Or:
+          Out << " | ";
+          break;
+        case Instruction::Xor:
+          Out << " ^ ";
+          break;
+        case Instruction::Shl:
+          Out << " << ";
+          break;
+        case Instruction::LShr:
+        case Instruction::AShr:
+          Out << " >> ";
+          break;
+        default:
 #ifndef NDEBUG
-                                  errs() << "Invalid operator type!" << opcode;
+          errs() << "Invalid operator type!" << opcode;
 #endif
-                                  llvm_unreachable(0);
+          llvm_unreachable(0);
         }
         Out << "b;\n";
       }
@@ -2526,7 +2773,8 @@ void CWriter::generateHeader(Module &M) {
       } else if (opcode == Instruction::Xor) {
         Out << "  r.hi = a.hi ^ b.hi;\n";
         Out << "  r.lo = a.lo ^ b.lo;\n";
-      } else if (opcode == Instruction::Shl) { // reminder: undef behavior if b >= 128
+      } else if (opcode ==
+                 Instruction::Shl) { // reminder: undef behavior if b >= 128
         Out << "  if (b.lo >= 64) {\n";
         Out << "    r.hi = (a.lo << (b.lo - 64));\n";
         Out << "    r.lo = 0;\n";
@@ -2541,26 +2789,44 @@ void CWriter::generateHeader(Module &M) {
         // everything that hasn't been manually implemented above
         Out << "  LLVM";
         switch (opcode) {
-          //case BinaryNeg: Out << "Neg"; break;
-          //case BinaryNot: Out << "FlipAllBits"; break;
-          case Instruction::Add: Out << "Add"; break;
-          case Instruction::Sub: Out << "Sub"; break;
-          case Instruction::Mul: Out << "Mul"; break;
-          case Instruction::URem: Out << "URem"; break;
-          case Instruction::SRem: Out << "SRem"; break;
-          case Instruction::UDiv: Out << "UDiv"; break;
-          case Instruction::SDiv: Out << "SDiv"; break;
-                                  //case Instruction::And:  Out << "And"; break;
-                                  //case Instruction::Or:   Out << "Or"; break;
-                                  //case Instruction::Xor:  Out << "Xor"; break;
-                                  //case Instruction::Shl: Out << "Shl"; break;
-          case Instruction::LShr: Out << "LShr"; break;
-          case Instruction::AShr: Out << "AShr"; break;
-          default:
+        // case BinaryNeg: Out << "Neg"; break;
+        // case BinaryNot: Out << "FlipAllBits"; break;
+        case Instruction::Add:
+          Out << "Add";
+          break;
+        case Instruction::Sub:
+          Out << "Sub";
+          break;
+        case Instruction::Mul:
+          Out << "Mul";
+          break;
+        case Instruction::URem:
+          Out << "URem";
+          break;
+        case Instruction::SRem:
+          Out << "SRem";
+          break;
+        case Instruction::UDiv:
+          Out << "UDiv";
+          break;
+        case Instruction::SDiv:
+          Out << "SDiv";
+          break;
+          // case Instruction::And:  Out << "And"; break;
+          // case Instruction::Or:   Out << "Or"; break;
+          // case Instruction::Xor:  Out << "Xor"; break;
+          // case Instruction::Shl: Out << "Shl"; break;
+        case Instruction::LShr:
+          Out << "LShr";
+          break;
+        case Instruction::AShr:
+          Out << "AShr";
+          break;
+        default:
 #ifndef NDEBUG
-                                  errs() << "Invalid operator type!" << opcode;
+          errs() << "Invalid operator type!" << opcode;
 #endif
-                                  llvm_unreachable(0);
+          llvm_unreachable(0);
         }
         Out << "(16, &a, &b, &r);\n";
       }
@@ -2580,34 +2846,54 @@ void CWriter::generateHeader(Module &M) {
           Out << "fmodf(a, b)";
         else if (ElemTy->isDoubleTy())
           Out << "fmod(a, b)";
-        else  // all 3 flavors of long double
+        else // all 3 flavors of long double
           Out << "fmodl(a, b)";
       } else {
         Out << "a";
         switch (opcode) {
-          case Instruction::Add:
-          case Instruction::FAdd: Out << " + "; break;
-          case Instruction::Sub:
-          case Instruction::FSub: Out << " - "; break;
-          case Instruction::Mul:
-          case Instruction::FMul: Out << " * "; break;
-          case Instruction::URem:
-          case Instruction::SRem:
-          case Instruction::FRem: Out << " % "; break;
-          case Instruction::UDiv:
-          case Instruction::SDiv:
-          case Instruction::FDiv: Out << " / "; break;
-          case Instruction::And:  Out << " & "; break;
-          case Instruction::Or:   Out << " | "; break;
-          case Instruction::Xor:  Out << " ^ "; break;
-          case Instruction::Shl : Out << " << "; break;
-          case Instruction::LShr:
-          case Instruction::AShr: Out << " >> "; break;
-          default:
+        case Instruction::Add:
+        case Instruction::FAdd:
+          Out << " + ";
+          break;
+        case Instruction::Sub:
+        case Instruction::FSub:
+          Out << " - ";
+          break;
+        case Instruction::Mul:
+        case Instruction::FMul:
+          Out << " * ";
+          break;
+        case Instruction::URem:
+        case Instruction::SRem:
+        case Instruction::FRem:
+          Out << " % ";
+          break;
+        case Instruction::UDiv:
+        case Instruction::SDiv:
+        case Instruction::FDiv:
+          Out << " / ";
+          break;
+        case Instruction::And:
+          Out << " & ";
+          break;
+        case Instruction::Or:
+          Out << " | ";
+          break;
+        case Instruction::Xor:
+          Out << " ^ ";
+          break;
+        case Instruction::Shl:
+          Out << " << ";
+          break;
+        case Instruction::LShr:
+        case Instruction::AShr:
+          Out << " >> ";
+          break;
+        default:
 #ifndef NDEBUG
-                                  errs() << "Invalid operator type!" << opcode;
+          errs() << "Invalid operator type!" << opcode;
 #endif
-                                  llvm_unreachable(0);
+          llvm_unreachable(0);
         }
         Out << "b";
         if (mask)
@@ -2619,9 +2905,11 @@ void CWriter::generateHeader(Module &M) {
   }
 
   // Loop over all inline constructors
-  for (std::set<Type*>::iterator it = CtorDeclTypes.begin(), end = CtorDeclTypes.end();
-      it != end; ++it) {
-    // static __forceinline <u32 x 4> llvm_ctor_u32x4(u32 x1, u32 x2, u32 x3, u32 x4) {
+  for (std::set<Type *>::iterator it = CtorDeclTypes.begin(),
+                                  end = CtorDeclTypes.end();
+       it != end; ++it) {
+    // static __forceinline <u32 x 4> llvm_ctor_u32x4(u32 x1, u32 x2, u32 x3,
+    // u32 x4) {
     //   Rty r = {
     //     x1, x2, x3, x4
     //   };
@@ -2635,10 +2923,12 @@ void CWriter::generateHeader(Module &M) {
     StructType *STy = dyn_cast<StructType>(*it);
     ArrayType *ATy = dyn_cast<ArrayType>(*it);
     VectorType *VTy = dyn_cast<VectorType>(*it);
-    unsigned e = (STy ? STy->getNumElements() : (ATy ? ATy->getNumElements() : VTy->getNumElements()));
+    unsigned e = (STy ? STy->getNumElements()
+                      : (ATy ? ATy->getNumElements() : VTy->getNumElements()));
     bool printed = false;
     for (unsigned i = 0; i != e; ++i) {
-      Type *ElTy = STy ? STy->getElementType(i) : (*it)->getSequentialElementType();
+      Type *ElTy =
+          STy ? STy->getElementType(i) : (*it)->getSequentialElementType();
       if (isEmptyType(ElTy))
         Out << " /* ";
       else if (printed)
@@ -2654,7 +2944,8 @@ void CWriter::generateHeader(Module &M) {
     printTypeName(Out, *it);
     Out << " r;";
     for (unsigned i = 0; i != e; ++i) {
-      Type *ElTy = STy ? STy->getElementType(i) : (*it)->getSequentialElementType();
+      Type *ElTy =
+          STy ? STy->getElementType(i) : (*it)->getSequentialElementType();
       if (isEmptyType(ElTy))
         continue;
       if (STy)
@@ -2670,9 +2961,9 @@ void CWriter::generateHeader(Module &M) {
   }
 
   // Emit definitions of the intrinsics.
-  for (SmallVector<Function*, 16>::iterator
-      I = intrinsicsToDefine.begin(),
-      E = intrinsicsToDefine.end(); I != E; ++I) {
+  for (SmallVector<Function *, 16>::iterator I = intrinsicsToDefine.begin(),
+                                             E = intrinsicsToDefine.end();
+       I != E; ++I) {
     printIntrinsicDefinition(**I, Out);
   }
 
@@ -2680,7 +2971,7 @@ void CWriter::generateHeader(Module &M) {
     Out << "\n\n/* Function Bodies */\n";
 }
 
-void CWriter::declareOneGlobalVariable(GlobalVariable* I) {
+void CWriter::declareOneGlobalVariable(GlobalVariable *I) {
   if (I->isDeclaration() || isEmptyType(I->getType()->getPointerElementType()))
     return;
 
@@ -2702,8 +2993,7 @@ void CWriter::declareOneGlobalVariable(GlobalVariable* I) {
 
   Type *ElTy = I->getType()->getElementType();
   unsigned Alignment = I->getAlignment();
-  bool IsOveraligned = Alignment &&
-    Alignment > TD->getABITypeAlignment(ElTy);
+  bool IsOveraligned = Alignment && Alignment > TD->getABITypeAlignment(ElTy);
   //  if (IsOveraligned)
   //    Out << "__MSALIGN__(" << Alignment << ") ";
   printTypeName(Out, ElTy, false) << ' ' << GetValueName(I);
@@ -2727,13 +3017,13 @@ void CWriter::declareOneGlobalVariable(GlobalVariable* I) {
   // and common, so we disable this optimization.
   // FIXME common linkage should avoid this problem.
   if (!I->getInitializer()->isNullValue()) {
-    Out << " = " ;
+    Out << " = ";
     writeOperand(I->getInitializer(), ContextStatic);
   } else if (I->hasWeakLinkage()) {
     // We have to specify an initializer, but it doesn't have to be
     // complete.  If the value is an aggregate, print out { 0 }, and let
     // the compiler figure out the rest of the zeros.
-    Out << " = " ;
+    Out << " = ";
     if (I->getInitializer()->getType()->isStructTy() ||
         I->getInitializer()->getType()->isVectorTy()) {
       Out << "{ 0 }";
@@ -2757,7 +3047,8 @@ void CWriter::printFloatingPointConstants(Function &F) {
   // precision.
   //
   for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I)
-    for (Instruction::op_iterator I_Op = I->op_begin(), E_Op = I->op_end(); I_Op != E_Op; ++I_Op)
+    for (Instruction::op_iterator I_Op = I->op_begin(), E_Op = I->op_end();
+         I_Op != E_Op; ++I_Op)
       if (const Constant *C = dyn_cast<Constant>(I_Op))
         printFloatingPointConstants(C);
   Out << '\n';
@@ -2780,44 +3071,39 @@ void CWriter::printFloatingPointConstants(const Constant *C) {
       FPConstantMap.count(FPC))
     return;
 
-  FPConstantMap[FPC] = FPCounter;  // Number the FP constants
+  FPConstantMap[FPC] = FPCounter; // Number the FP constants
 
   if (FPC->getType() == Type::getDoubleTy(FPC->getContext())) {
     double Val = FPC->getValueAPF().convertToDouble();
     uint64_t i = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
-    Out << "static const ConstantDoubleTy FPConstant" << FPCounter++
-      << " = 0x" << utohexstr(i)
-      << "ULL;    /* " << Val << " */\n";
+    Out << "static const ConstantDoubleTy FPConstant" << FPCounter++ << " = 0x"
+        << utohexstr(i) << "ULL;    /* " << Val << " */\n";
   } else if (FPC->getType() == Type::getFloatTy(FPC->getContext())) {
     float Val = FPC->getValueAPF().convertToFloat();
-    uint32_t i = (uint32_t)FPC->getValueAPF().bitcastToAPInt().
-      getZExtValue();
-    Out << "static const ConstantFloatTy FPConstant" << FPCounter++
-      << " = 0x" << utohexstr(i)
-      << "U;    /* " << Val << " */\n";
+    uint32_t i = (uint32_t)FPC->getValueAPF().bitcastToAPInt().getZExtValue();
+    Out << "static const ConstantFloatTy FPConstant" << FPCounter++ << " = 0x"
+        << utohexstr(i) << "U;    /* " << Val << " */\n";
   } else if (FPC->getType() == Type::getX86_FP80Ty(FPC->getContext())) {
     // api needed to prevent premature destruction
     const APInt api = FPC->getValueAPF().bitcastToAPInt();
     const uint64_t *p = api.getRawData();
-    Out << "static const ConstantFP80Ty FPConstant" << FPCounter++
-      << " = { 0x" << utohexstr(p[0])
-      << "ULL, 0x" << utohexstr((uint16_t)p[1]) << ",{0,0,0}"
-      << "}; /* Long double constant */\n";
+    Out << "static const ConstantFP80Ty FPConstant" << FPCounter++ << " = { 0x"
+        << utohexstr(p[0]) << "ULL, 0x" << utohexstr((uint16_t)p[1])
+        << ",{0,0,0}"
+        << "}; /* Long double constant */\n";
   } else if (FPC->getType() == Type::getPPC_FP128Ty(FPC->getContext()) ||
-      FPC->getType() == Type::getFP128Ty(FPC->getContext())) {
+             FPC->getType() == Type::getFP128Ty(FPC->getContext())) {
     const APInt api = FPC->getValueAPF().bitcastToAPInt();
     const uint64_t *p = api.getRawData();
-    Out << "static const ConstantFP128Ty FPConstant" << FPCounter++
-      << " = { 0x"
-      << utohexstr(p[0]) << ", 0x" << utohexstr(p[1])
-      << "}; /* Long double constant */\n";
+    Out << "static const ConstantFP128Ty FPConstant" << FPCounter++ << " = { 0x"
+        << utohexstr(p[0]) << ", 0x" << utohexstr(p[1])
+        << "}; /* Long double constant */\n";
 
   } else {
     llvm_unreachable("Unknown float type!");
   }
 }
 
-
 /// printSymbolTable - Run through symbol table looking for type names.  If a
 /// type name is found, emit its declaration...
 ///
@@ -2831,7 +3117,7 @@ void CWriter::printModuleTypes(raw_ostream &Out) {
   Out << "} llvmBitCastUnion;\n";
 
   // Keep track of which types have been printed so far.
-  std::set<Type*> TypesPrinted;
+  std::set<Type *> TypesPrinted;
 
   // Loop over all structures then push them into the stack so they are
   // printed in the correct order.
@@ -2840,8 +3126,9 @@ void CWriter::printModuleTypes(raw_ostream &Out) {
   // forward-declare all structs here first
 
   {
-    std::set<Type*> TypesPrinted;
-    for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); it != end; ++it) {
+    std::set<Type *> TypesPrinted;
+    for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end();
+         it != end; ++it) {
       forwardDeclareStructs(Out, *it, TypesPrinted);
     }
   }
@@ -2849,31 +3136,35 @@ void CWriter::printModuleTypes(raw_ostream &Out) {
   // forward-declare all function pointer typedefs (Issue #2)
 
   {
-    std::set<Type*> TypesPrinted;
-    for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); it != end; ++it) {
+    std::set<Type *> TypesPrinted;
+    for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end();
+         it != end; ++it) {
       forwardDeclareFunctionTypedefs(Out, *it, TypesPrinted);
     }
   }
 
-
   Out << "\n/* Types Definitions */\n";
 
-  for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end(); it != end; ++it) {
+  for (auto it = TypedefDeclTypes.begin(), end = TypedefDeclTypes.end();
+       it != end; ++it) {
     printContainedTypes(Out, *it, TypesPrinted);
   }
 
   Out << "\n/* Function definitions */\n";
 
   // Question: Is UnnamedFunctionIDs ever non-empty?
-  for (DenseMap<std::pair<FunctionType*,
-      std::pair<AttributeList, CallingConv::ID> >, unsigned>::iterator
-      I = UnnamedFunctionIDs.begin(), E = UnnamedFunctionIDs.end();
-      I != E; ++I) {
+  for (DenseMap<
+           std::pair<FunctionType *, std::pair<AttributeList, CallingConv::ID>>,
+           unsigned>::iterator I = UnnamedFunctionIDs.begin(),
+                               E = UnnamedFunctionIDs.end();
+       I != E; ++I) {
 
     Out << '\n';
-    std::pair<FunctionType*, std::pair<AttributeList, CallingConv::ID> > F = I->first;
+    std::pair<FunctionType *, std::pair<AttributeList, CallingConv::ID>> F =
+        I->first;
     if (F.second.first == AttributeList() && F.second.second == CallingConv::C)
-      if (!TypesPrinted.insert(F.first).second) continue; // already printed this above
+      if (!TypesPrinted.insert(F.first).second)
+        continue; // already printed this above
 
     // FIXME: Removing apparently unused function call - need to check
     printFunctionDeclaration(Out, F.first, F.second);
@@ -2881,9 +3172,9 @@ void CWriter::printModuleTypes(raw_ostream &Out) {
 
   // We may have collected some intrinsic prototypes to emit.
   // Emit them now, before the function that uses them is emitted
-  for (std::vector<Function*>::iterator
-      I = prototypesToGen.begin(), E = prototypesToGen.end();
-      I != E; ++I) {
+  for (std::vector<Function *>::iterator I = prototypesToGen.begin(),
+                                         E = prototypesToGen.end();
+       I != E; ++I) {
     Out << '\n';
     Function *F = *I;
     printFunctionProto(Out, F);
@@ -2891,9 +3182,12 @@ void CWriter::printModuleTypes(raw_ostream &Out) {
   }
 }
 
-void CWriter::forwardDeclareStructs(raw_ostream &Out, Type *Ty, std::set<Type*> &TypesPrinted) {
-  if (!TypesPrinted.insert(Ty).second) return;
-  if (isEmptyType(Ty)) return;
+void CWriter::forwardDeclareStructs(raw_ostream &Out, Type *Ty,
+                                    std::set<Type *> &TypesPrinted) {
+  if (!TypesPrinted.insert(Ty).second)
+    return;
+  if (isEmptyType(Ty))
+    return;
 
   for (auto I = Ty->subtype_begin(); I != Ty->subtype_end(); ++I) {
     forwardDeclareStructs(Out, *I, TypesPrinted);
@@ -2904,9 +3198,12 @@ void CWriter::forwardDeclareStructs(raw_ostream &Out, Type *Ty, std::set<Type*>
   }
 }
 
-void CWriter::forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty, std::set<Type*> &TypesPrinted) {
-  if (!TypesPrinted.insert(Ty).second) return;
-  if (isEmptyType(Ty)) return;
+void CWriter::forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty,
+                                             std::set<Type *> &TypesPrinted) {
+  if (!TypesPrinted.insert(Ty).second)
+    return;
+  if (isEmptyType(Ty))
+    return;
 
   for (auto I = Ty->subtype_begin(); I != Ty->subtype_end(); ++I) {
     forwardDeclareFunctionTypedefs(Out, *I, TypesPrinted);
@@ -2921,15 +3218,17 @@ void CWriter::forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty, std::se
 // this one depends on.
 //
 void CWriter::printContainedTypes(raw_ostream &Out, Type *Ty,
-    std::set<Type*> &TypesPrinted) {
+                                  std::set<Type *> &TypesPrinted) {
   // Check to see if we have already printed this struct.
-  if (!TypesPrinted.insert(Ty).second) return;
+  if (!TypesPrinted.insert(Ty).second)
+    return;
   // Skip empty structs
-  if (isEmptyType(Ty)) return;
+  if (isEmptyType(Ty))
+    return;
 
   // Print all contained types first.
-  for (Type::subtype_iterator I = Ty->subtype_begin(),
-      E = Ty->subtype_end(); I != E; ++I)
+  for (Type::subtype_iterator I = Ty->subtype_begin(), E = Ty->subtype_end();
+       I != E; ++I)
     printContainedTypes(Out, *I, TypesPrinted);
 
   if (StructType *ST = dyn_cast<StructType>(Ty)) {
@@ -2950,22 +3249,23 @@ static inline bool isFPIntBitCast(Instruction &I) {
   Type *SrcTy = I.getOperand(0)->getType();
   Type *DstTy = I.getType();
   return (SrcTy->isFloatingPointTy() && DstTy->isIntegerTy()) ||
-    (DstTy->isFloatingPointTy() && SrcTy->isIntegerTy());
+         (DstTy->isFloatingPointTy() && SrcTy->isIntegerTy());
 }
 
 void CWriter::printFunction(Function &F) {
   bool isKernel = false;
 
-  if (NamedMDNode * KernelMD = F.getParent()->getNamedMetadata("opencl.kernels")) {
+  if (NamedMDNode *KernelMD =
+          F.getParent()->getNamedMetadata("opencl.kernels")) {
     for (auto iter : KernelMD->operands()) {
       //      DEBUG( errs() << "Kernel Metadata: " << *iter << "\n");
       const MDOperand *KernelMDOp = iter->operands().begin();
       Metadata *KMD = KernelMDOp->get();
-      if(ValueAsMetadata *KMDVAM = dyn_cast<ValueAsMetadata>(KMD)){
+      if (ValueAsMetadata *KMDVAM = dyn_cast<ValueAsMetadata>(KMD)) {
         Value *KMDVal = KMDVAM->getValue();
         Function *KMDFunc = dyn_cast<Function>(KMDVal);
-        if(KMDFunc == &F) {
-          //DEBUG(errs() << "-->Kernel Func: " << KMDFunc->getName() << "\n");
+        if (KMDFunc == &F) {
+          // DEBUG(errs() << "-->Kernel Func: " << KMDFunc->getName() << "\n");
           isKernel = true;
         }
       }
@@ -2976,12 +3276,15 @@ void CWriter::printFunction(Function &F) {
   bool isStructReturn = F.hasStructRetAttr();
 
   assert(!F.isDeclaration());
-  if (F.hasDLLImportStorageClass()) Out << "__declspec(dllimport) ";
-  if (F.hasDLLExportStorageClass()) Out << "__declspec(dllexport) ";
-  if (F.hasLocalLinkage()) Out << "static ";
-  printFunctionProto(Out, F.getFunctionType(),
-      std::make_pair(F.getAttributes(), F.getCallingConv()),
-      GetValueName(&F),
+  if (F.hasDLLImportStorageClass())
+    Out << "__declspec(dllimport) ";
+  if (F.hasDLLExportStorageClass())
+    Out << "__declspec(dllexport) ";
+  if (F.hasLocalLinkage())
+    Out << "static ";
+  printFunctionProto(
+      Out, F.getFunctionType(),
+      std::make_pair(F.getAttributes(), F.getCallingConv()), GetValueName(&F),
       F.arg_begin(), // NOTE: replacing ArgumentList (LLVM-4) with arg iterator
       //&F.getArgumentList(),
       isKernel);
@@ -2991,9 +3294,10 @@ void CWriter::printFunction(Function &F) {
   // If this is a struct return function, handle the result with magic.
   if (isStructReturn) {
     Type *StructTy =
-      cast<PointerType>(F.arg_begin()->getType())->getElementType();
+        cast<PointerType>(F.arg_begin()->getType())->getElementType();
     Out << "  ";
-    printTypeName(Out, StructTy, false) << " StructReturn;  /* Struct return temporary */\n";
+    printTypeName(Out, StructTy, false)
+        << " StructReturn;  /* Struct return temporary */\n";
 
     Out << "  ";
     printTypeName(Out, F.arg_begin()->getType(), false);
@@ -3005,10 +3309,10 @@ void CWriter::printFunction(Function &F) {
   // print local variable information for the function
   for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
     if (AllocaInst *AI = isDirectAlloca(&*I)) {
-      //DEBUG(errs() << "Processing alloca inst: " << *AI << "\n");
+      // DEBUG(errs() << "Processing alloca inst: " << *AI << "\n");
       unsigned Alignment = AI->getAlignment();
-      bool IsOveraligned = Alignment &&
-        Alignment > TD->getABITypeAlignment(AI->getAllocatedType());
+      bool IsOveraligned = Alignment && Alignment > TD->getABITypeAlignment(
+                                                        AI->getAllocatedType());
       Out << "  ";
       //      if (IsOveraligned)
       //        Out << "__MSALIGN__(" << Alignment << ") ";
@@ -3017,21 +3321,22 @@ void CWriter::printFunction(Function &F) {
       if (IsOveraligned)
         Out << " __attribute__((aligned(" << Alignment << ")))";
       if (AI->isArrayAllocation()) {
-        //DEBUG(errs() << "Alloca is an array allocation!\n");
-        unsigned arraySize = dyn_cast<ConstantInt>(AI->getArraySize())->getZExtValue();
+        // DEBUG(errs() << "Alloca is an array allocation!\n");
+        unsigned arraySize =
+            dyn_cast<ConstantInt>(AI->getArraySize())->getZExtValue();
         Out << "[" << arraySize << "]";
       }
       Out << ";    /* Address-exposed local */\n";
       PrintedVar = true;
-    } else if (!isEmptyType(I->getType()) &&
-        !isInlinableInst(*I)) {
+    } else if (!isEmptyType(I->getType()) && !isInlinableInst(*I)) {
       Out << "  ";
       printTypeName(Out, I->getType(), false) << ' ' << GetValueName(&*I);
       Out << ";\n";
 
-      if (isa<PHINode>(*I)) {  // Print out PHI node temporaries as well...
+      if (isa<PHINode>(*I)) { // Print out PHI node temporaries as well...
         Out << "  ";
-        printTypeName(Out, I->getType(), false) << ' ' << (GetValueName(&*I)+"__PHI_TEMPORARY");
+        printTypeName(Out, I->getType(), false)
+            << ' ' << (GetValueName(&*I) + "__PHI_TEMPORARY");
         Out << ";\n";
       }
       PrintedVar = true;
@@ -3041,7 +3346,7 @@ void CWriter::printFunction(Function &F) {
     // variable to hold the result of the BitCast.
     if (isFPIntBitCast(*I)) {
       Out << "  llvmBitCastUnion " << GetValueName(&*I)
-        << "__BITCAST_TEMPORARY;\n";
+          << "__BITCAST_TEMPORARY;\n";
       PrintedVar = true;
     }
   }
@@ -3052,11 +3357,13 @@ void CWriter::printFunction(Function &F) {
   // print the basic blocks
   //  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
 
-  std::set<BasicBlock*> VisitSet;
-  BasicBlock* entry = &(F.getEntryBlock());
-  // starting printing from entry, then CFG traversal will print the reachable blocks.
+  std::set<BasicBlock *> VisitSet;
+  BasicBlock *entry = &(F.getEntryBlock());
+  // starting printing from entry, then CFG traversal will print the reachable
+  // blocks.
   printBBorLoop(entry);
-  //  for (df_iterator<BasicBlock*> BI = df_begin(entry), BE = df_end(entry); BI!=BE; ++BI) { 
+  //  for (df_iterator<BasicBlock*> BI = df_begin(entry), BE = df_end(entry);
+  //  BI!=BE; ++BI) {
   //    BasicBlock *BB = *BI;
   //    printBBorLoop(BB);
   //    if(VisitedBlocks.find(BB) == VisitedBlocks.end()) {
@@ -3073,29 +3380,29 @@ void CWriter::printFunction(Function &F) {
   Out << "}\n\n";
 }
 
-
-bool CWriter::extractIndVarChain(Instruction *Inst, std::stack<Instruction*> *IndVarChain, Instruction *Branch, unsigned indent) {
-  //Traverse def-use chain of induction variable to make sure that
-  //it ends at the branch. Keep stack of all instructions leading there.
-  for(User *U : Inst->users()) {
+bool CWriter::extractIndVarChain(Instruction *Inst,
+                                 std::stack<Instruction *> *IndVarChain,
+                                 Instruction *Branch, unsigned indent) {
+  // Traverse def-use chain of induction variable to make sure that
+  // it ends at the branch. Keep stack of all instructions leading there.
+  for (User *U : Inst->users()) {
     //    DEBUG(errs() << std::string(indent, '-'));
     //    DEBUG(errs() << "->Found user: " << *U << "\n");
-    if(Instruction *UInst = dyn_cast<Instruction>(U)) {
-      if(UInst == Branch) {
+    if (Instruction *UInst = dyn_cast<Instruction>(U)) {
+      if (UInst == Branch) {
         //        DEBUG(errs() << "Found correct path, returning!\n");
         return true;
-      }
-      else if (isa<PHINode>(UInst)) {
-        //        DEBUG(errs() << "Reached a PHI Node => Wrong path! Returning!\n");
+      } else if (isa<PHINode>(UInst)) {
+        //        DEBUG(errs() << "Reached a PHI Node => Wrong path!
+        //        Returning!\n");
         continue;
-      }
-      else {
+      } else {
         IndVarChain->push(UInst);
-        if(extractIndVarChain(UInst, IndVarChain, Branch, indent+2)) {
+        if (extractIndVarChain(UInst, IndVarChain, Branch, indent + 2)) {
           return true;
-        }
-        else {
-          //          DEBUG(errs() << "Wrong path, popping: " << *(IndVarChain->top()) << "\n");
+        } else {
+          //          DEBUG(errs() << "Wrong path, popping: " <<
+          //          *(IndVarChain->top()) << "\n");
           IndVarChain->pop();
         }
       }
@@ -3105,53 +3412,61 @@ bool CWriter::extractIndVarChain(Instruction *Inst, std::stack<Instruction*> *In
   return false;
 }
 
-bool CWriter::findLoopBranch(BranchInst **LBranch, BasicBlock* CurBlock, BasicBlock* LHeader, std::set<BasicBlock*>*visitSet) {
+bool CWriter::findLoopBranch(BranchInst **LBranch, BasicBlock *CurBlock,
+                             BasicBlock *LHeader,
+                             std::set<BasicBlock *> *visitSet) {
   bool result = false;
-  //  DEBUG(errs() << "Finding loop branch in " << CurBlock->getName() << "!\n");
-  if(BranchInst *LBranchTemp = dyn_cast<BranchInst>(CurBlock->getTerminator())) {
+  //  DEBUG(errs() << "Finding loop branch in " << CurBlock->getName() <<
+  //  "!\n");
+  if (BranchInst *LBranchTemp =
+          dyn_cast<BranchInst>(CurBlock->getTerminator())) {
     //    DEBUG(errs() << "Branch: " << *LBranchTemp << "\n");
-    if(LBranchTemp->isConditional()) {
-      if(LBranchTemp->getSuccessor(0) == LHeader || LBranchTemp->getSuccessor(1) == LHeader) {
+    if (LBranchTemp->isConditional()) {
+      if (LBranchTemp->getSuccessor(0) == LHeader ||
+          LBranchTemp->getSuccessor(1) == LHeader) {
         *LBranch = LBranchTemp;
         //        DEBUG(errs() << "Found Loop branch: " << **LBranch << "\n");
         result = true;
       } else {
-        BasicBlock* NextBlock1 = LBranchTemp->getSuccessor(0);
-        BasicBlock* NextBlock2 = LBranchTemp->getSuccessor(1);
-        if(visitSet->find(NextBlock1) == visitSet->end()) {
-          //          DEBUG(errs() << "Visiting unvisited node: " << NextBlock1->getName() << "\n");
+        BasicBlock *NextBlock1 = LBranchTemp->getSuccessor(0);
+        BasicBlock *NextBlock2 = LBranchTemp->getSuccessor(1);
+        if (visitSet->find(NextBlock1) == visitSet->end()) {
+          //          DEBUG(errs() << "Visiting unvisited node: " <<
+          //          NextBlock1->getName() << "\n");
           visitSet->insert(NextBlock1);
           result |= findLoopBranch(LBranch, NextBlock1, LHeader, visitSet);
         }
-        if(visitSet->find(NextBlock2) == visitSet->end()) {
-          //          DEBUG(errs() << "Visiting unvisited node: " << NextBlock2->getName() << "\n");
+        if (visitSet->find(NextBlock2) == visitSet->end()) {
+          //          DEBUG(errs() << "Visiting unvisited node: " <<
+          //          NextBlock2->getName() << "\n");
           visitSet->insert(NextBlock2);
           result |= findLoopBranch(LBranch, NextBlock2, LHeader, visitSet);
         }
       }
 
     } else {
-      if(LBranchTemp->getSuccessor(0) == LHeader) {
+      if (LBranchTemp->getSuccessor(0) == LHeader) {
         *LBranch = LBranchTemp;
         //        DEBUG(errs() << "Found Loop branch: " << **LBranch << "\n");
         result = true;
       } else {
         BasicBlock *NextBlock = LBranchTemp->getSuccessor(0);
-        if(visitSet->find(NextBlock) == visitSet->end()) {
-          //          DEBUG(errs() << "Visiting unvisited node: " << NextBlock->getName() << "\n");
+        if (visitSet->find(NextBlock) == visitSet->end()) {
+          //          DEBUG(errs() << "Visiting unvisited node: " <<
+          //          NextBlock->getName() << "\n");
           visitSet->insert(NextBlock);
           result |= findLoopBranch(LBranch, NextBlock, LHeader, visitSet);
         }
       }
     }
   }
-  return result; 
+  return result;
 }
 
 bool CWriter::traverseUseDefChain(Instruction *I, PHINode *PI) {
   //  DEBUG(errs() << "traversing: " << *I << "\n");
   bool result = false;
-  if(PHINode *PHI = dyn_cast<PHINode>(I)) {
+  if (PHINode *PHI = dyn_cast<PHINode>(I)) {
     if (PI == PHI) {
       //      DEBUG(errs() << "returning true\n");
       result = true;
@@ -3162,9 +3477,9 @@ bool CWriter::traverseUseDefChain(Instruction *I, PHINode *PI) {
     }
   } else {
     for (Use &U : I->operands()) {
-      if(Instruction *UInst = dyn_cast<Instruction>(U)) {
+      if (Instruction *UInst = dyn_cast<Instruction>(U)) {
         result |= traverseUseDefChain(UInst, PI);
-      }  
+      }
     }
   }
   return result;
@@ -3186,19 +3501,20 @@ void CWriter::printLoop(Loop *L) {
   auto *ExitingBranch = ExitingBlock->getTerminator();
   //  DEBUG(errs() << "Exiting Branch: " << *ExitingBranch << "\n");
   InductionDescriptor ID;
-  if (L->getLoopPreheader()==nullptr) {
+  if (L->getLoopPreheader() == nullptr) {
     //    DEBUG(errs() << "Loop has no preheader!\n");
   }
   //  DEBUG(errs() << "Looking for induction variables\n");
   //  if (PHINode *IndVar = L->getCanonicalInductionVariable()) {
   //    InductionVariable = IndVar;
-  //    DEBUG(errs() << "Found canonical induction variable:\n" << *IndVar << "\n");
+  //    DEBUG(errs() << "Found canonical induction variable:\n" << *IndVar <<
+  //    "\n");
   //  }
   bool found = false;
   for (auto I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
     PHINode *PHI = cast<PHINode>(I);
     //    DEBUG(errs() << "Phi Node: " << *PHI << "\n");
-    if(InductionDescriptor::isInductionPHI(PHI,L,PSE,ID)) {
+    if (InductionDescriptor::isInductionPHI(PHI, L, PSE, ID)) {
       //      DEBUG(errs() << "Found induction: " << *PHI << "\n");
       InductionVariable = PHI;
       found = true;
@@ -3206,18 +3522,18 @@ void CWriter::printLoop(Loop *L) {
     }
   }
 
-  if(!found) {
+  if (!found) {
     llvm_unreachable("Couldn't find induction Variable in loop!\n");
   }
 
   LInductionVars.insert(InductionVariable);
-  LoopIndVarsMap.insert(std::pair<Loop*, PHINode*>(L,InductionVariable));
+  LoopIndVarsMap.insert(std::pair<Loop *, PHINode *>(L, InductionVariable));
 
   Value *IV = dyn_cast<Value>(InductionVariable);
   std::string IVName = GetValueName(IV);
 
   Optional<Loop::LoopBounds> OLB = L->getBounds(*SE);
-  if(OLB.hasValue()) {
+  if (OLB.hasValue()) {
     Loop::LoopBounds LB = OLB.getValue();
     Value *StartValue = &(LB.getInitialIVValue());
     Instruction *StepInstruction = &(LB.getStepInst());
@@ -3225,98 +3541,130 @@ void CWriter::printLoop(Loop *L) {
     Value *FinalValue = &(LB.getFinalIVValue());
     ICmpInst::Predicate LoopPredicate = LB.getCanonicalPredicate();
     std::string BranchPredicate;
-    switch(LoopPredicate) {
-      case ICmpInst::ICMP_EQ:  BranchPredicate = " == "; break;
-      case ICmpInst::ICMP_NE:  BranchPredicate = " != "; break;
-      case ICmpInst::ICMP_ULE:
-      case ICmpInst::ICMP_SLE: BranchPredicate = " < "; break;
-      case ICmpInst::ICMP_UGE:
-      case ICmpInst::ICMP_SGE: BranchPredicate = " > "; break;
-      case ICmpInst::ICMP_ULT:
-      case ICmpInst::ICMP_SLT: BranchPredicate = " <= "; break;
-      case ICmpInst::ICMP_UGT:
-      case ICmpInst::ICMP_SGT: BranchPredicate = " >= "; break;
-      default: llvm_unreachable("Illegal ICmp predicate");
+    switch (LoopPredicate) {
+    case ICmpInst::ICMP_EQ:
+      BranchPredicate = " == ";
+      break;
+    case ICmpInst::ICMP_NE:
+      BranchPredicate = " != ";
+      break;
+    case ICmpInst::ICMP_ULE:
+    case ICmpInst::ICMP_SLE:
+      BranchPredicate = " < ";
+      break;
+    case ICmpInst::ICMP_UGE:
+    case ICmpInst::ICMP_SGE:
+      BranchPredicate = " > ";
+      break;
+    case ICmpInst::ICMP_ULT:
+    case ICmpInst::ICMP_SLT:
+      BranchPredicate = " <= ";
+      break;
+    case ICmpInst::ICMP_UGT:
+    case ICmpInst::ICMP_SGT:
+      BranchPredicate = " >= ";
+      break;
+    default:
+      llvm_unreachable("Illegal ICmp predicate");
     }
-    errs() << "IV: " << *IV<< "\n";
-    errs() << "StartValue: " << *StartValue<< "\n";
-    errs() << "StepInstruction: " << *StepInstruction<< "\n";
-    errs() << "StepValue: " << *StepValue<< "\n";
-    errs() << "FinalValue: " << *FinalValue<< "\n";
-    errs() << "Branch Predicate: " << BranchPredicate<< "\n";
-    errs() << "Direction: " << ((LB.getDirection() == Loop::LoopBounds::Direction::Increasing) 
-        ? "increasing" : "decreasing") << "\n";
-
-    std::string startStr; 
+    errs() << "IV: " << *IV << "\n";
+    errs() << "StartValue: " << *StartValue << "\n";
+    errs() << "StepInstruction: " << *StepInstruction << "\n";
+    errs() << "StepValue: " << *StepValue << "\n";
+    errs() << "FinalValue: " << *FinalValue << "\n";
+    errs() << "Branch Predicate: " << BranchPredicate << "\n";
+    errs() << "Direction: "
+           << ((LB.getDirection() == Loop::LoopBounds::Direction::Increasing)
+                   ? "increasing"
+                   : "decreasing")
+           << "\n";
+
+    std::string startStr;
     if (ConstantInt *startConst = dyn_cast<ConstantInt>(StartValue)) {
       startStr = std::to_string(startConst->getSExtValue());
     } else {
       startStr = GetValueName(StartValue);
     }
-    std::string finalStr; 
+    std::string finalStr;
     if (ConstantInt *finalConst = dyn_cast<ConstantInt>(FinalValue)) {
       finalStr = std::to_string(finalConst->getSExtValue());
     } else {
       finalStr = GetValueName(FinalValue);
     }
-    std::string stepStr; 
+    std::string stepStr;
     if (ConstantInt *stepConst = dyn_cast<ConstantInt>(StepValue)) {
       stepStr = std::to_string(stepConst->getSExtValue());
     } else {
       stepStr = GetValueName(StepValue);
     }
 
-    errs() << "\n  for ( " << IVName << " = " << startStr << "; " 
-      << IVName << BranchPredicate << finalStr << "; " 
-      << IVName << " = " << IVName << " + " << stepStr << ") {\n";
+    errs() << "\n  for ( " << IVName << " = " << startStr << "; " << IVName
+           << BranchPredicate << finalStr << "; " << IVName << " = " << IVName
+           << " + " << stepStr << ") {\n";
 
-    Out << "\n  for ( " << IVName << " = " << startStr << "; " 
-      << IVName << BranchPredicate << finalStr << "; " 
-      << IVName << " = " << IVName << " + " << stepStr << ") {\n";
+    Out << "\n  for ( " << IVName << " = " << startStr << "; " << IVName
+        << BranchPredicate << finalStr << "; " << IVName << " = " << IVName
+        << " + " << stepStr << ") {\n";
 
   } else {
     llvm_unreachable("No Loop Bounds!");
     Value *StartValue = ID.getStartValue();
     const SCEV *Step = ID.getStep();
-    //  unsigned IterationCount = SE->getSmallConstantMaxTripCount(L); 
-    //  DEBUG(errs() << "StartValue: " << *StartValue << "\nStep: " << *Step << "\nIterationCount: " << IterationCount << "\n");
+    //  unsigned IterationCount = SE->getSmallConstantMaxTripCount(L);
+    //  DEBUG(errs() << "StartValue: " << *StartValue << "\nStep: " << *Step <<
+    //  "\nIterationCount: " << IterationCount << "\n");
 
     std::string IVOp;
 
     if (const SCEVConstant *stepConst = dyn_cast<SCEVConstant>(Step)) {
-      if(stepConst->getAPInt().isNonNegative()) {
-        IVOp = " + ";  
+      if (stepConst->getAPInt().isNonNegative()) {
+        IVOp = " + ";
       }
     }
 
-
     std::string BranchPredicate;
-    ICmpInst *BranchCondition = dyn_cast<ICmpInst>(dyn_cast<BranchInst>(ExitingBranch)->getCondition());
-    switch(BranchCondition->getPredicate()) {
-      case ICmpInst::ICMP_EQ:  BranchPredicate = " != "; break;
-      case ICmpInst::ICMP_NE:  BranchPredicate = " == "; break;
-      case ICmpInst::ICMP_ULE:
-      case ICmpInst::ICMP_SLE: BranchPredicate = " > "; break;
-      case ICmpInst::ICMP_UGE:
-      case ICmpInst::ICMP_SGE: BranchPredicate = " < "; break;
-      case ICmpInst::ICMP_ULT:
-      case ICmpInst::ICMP_SLT: BranchPredicate = " >= "; break;
-      case ICmpInst::ICMP_UGT:
-      case ICmpInst::ICMP_SGT: BranchPredicate = " <= "; break;
-      default: llvm_unreachable("Illegal ICmp predicate");
-    }
-
-    //  DEBUG(errs() << "Branch Condition: " << *BranchCondition << "\n");
+    ICmpInst *BranchCondition =
+        dyn_cast<ICmpInst>(dyn_cast<BranchInst>(ExitingBranch)->getCondition());
+    switch (BranchCondition->getPredicate()) {
+    case ICmpInst::ICMP_EQ:
+      BranchPredicate = " != ";
+      break;
+    case ICmpInst::ICMP_NE:
+      BranchPredicate = " == ";
+      break;
+    case ICmpInst::ICMP_ULE:
+    case ICmpInst::ICMP_SLE:
+      BranchPredicate = " > ";
+      break;
+    case ICmpInst::ICMP_UGE:
+    case ICmpInst::ICMP_SGE:
+      BranchPredicate = " < ";
+      break;
+    case ICmpInst::ICMP_ULT:
+    case ICmpInst::ICMP_SLT:
+      BranchPredicate = " >= ";
+      break;
+    case ICmpInst::ICMP_UGT:
+    case ICmpInst::ICMP_SGT:
+      BranchPredicate = " <= ";
+      break;
+    default:
+      llvm_unreachable("Illegal ICmp predicate");
+    }
+
+    //  DEBUG(errs() << "Branch Condition: " << *BranchCondition << "\n");
 
     std::string compLHS, compRHS;
     Value *CondOp1 = BranchCondition->getOperand(0);
     //  DEBUG(errs() << "CondOp1: " << *CondOp1 << "\n");
     if (Constant *constOp1 = dyn_cast<Constant>(CondOp1)) {
-      //    DEBUG(errs() << "Condition Operand is a constant, inserting it as is.\n");
-      compLHS = (constOp1->getUniqueInteger()).toString(10,1);
+      //    DEBUG(errs() << "Condition Operand is a constant, inserting it as
+      //    is.\n");
+      compLHS = (constOp1->getUniqueInteger()).toString(10, 1);
     } else {
       //    DEBUG(errs() << "Condition Operand is not a constant, ");
-      if(traverseUseDefChain(dyn_cast<Instruction>(CondOp1), InductionVariable)) {
+      if (traverseUseDefChain(dyn_cast<Instruction>(CondOp1),
+                              InductionVariable)) {
         //      DEBUG(errs() << "it is the IV.\n");
         compLHS = GetValueName(IV);
       } else {
@@ -3327,11 +3675,13 @@ void CWriter::printLoop(Loop *L) {
     Value *CondOp2 = BranchCondition->getOperand(1);
     //  DEBUG(errs() << "CondOp2: " << *CondOp2 << "\n");
     if (Constant *constOp2 = dyn_cast<Constant>(CondOp2)) {
-      //    DEBUG(errs() << "Condition Operand is a constant, inserting it as is.\n");
-      compRHS = (constOp2->getUniqueInteger()).toString(10,1);
+      //    DEBUG(errs() << "Condition Operand is a constant, inserting it as
+      //    is.\n");
+      compRHS = (constOp2->getUniqueInteger()).toString(10, 1);
     } else {
       //    DEBUG(errs() << "Condition Operand is not a constant.\n");
-      if(traverseUseDefChain(dyn_cast<Instruction>(CondOp2), InductionVariable)) {
+      if (traverseUseDefChain(dyn_cast<Instruction>(CondOp2),
+                              InductionVariable)) {
         //      DEBUG(errs() << "It is the IV.\n");
         compRHS = GetValueName(IV);
       } else {
@@ -3340,24 +3690,22 @@ void CWriter::printLoop(Loop *L) {
       }
     }
 
-    std::string startStr; 
+    std::string startStr;
     if (Constant *startConst = dyn_cast<Constant>(StartValue)) {
-      startStr = (startConst->getUniqueInteger()).toString(10,1);
+      startStr = (startConst->getUniqueInteger()).toString(10, 1);
     } else {
       startStr = GetValueName(StartValue);
     }
 
-
-    //  DEBUG(errs() << "  for ( " << IVName << " = " << startStr << "; " 
-    //    << compLHS << BranchPredicate << compRHS << "; " 
+    //  DEBUG(errs() << "  for ( " << IVName << " = " << startStr << "; "
+    //    << compLHS << BranchPredicate << compRHS << "; "
     //    << IVName << " = " << IVName << IVOp << *Step << ") {\n");
 
-    Out << "\n  for ( " << IVName << " = " << startStr << "; " 
-      << compLHS << BranchPredicate << compRHS << "; " 
-      << IVName << " = " << IVName << IVOp << *Step << ") {\n";
+    Out << "\n  for ( " << IVName << " = " << startStr << "; " << compLHS
+        << BranchPredicate << compRHS << "; " << IVName << " = " << IVName
+        << IVOp << *Step << ") {\n";
   }
 
-
   BasicBlock *BB = L->getHeader();
   //  printBBorLoop(BB);
   printBasicBlock(BB);
@@ -3381,7 +3729,7 @@ void CWriter::printLoop(Loop *L) {
 }
 
 void CWriter::printBasicBlock(BasicBlock *BB) {
-  //DEBUG(errs() << "\n\nProcessing Basic Block: " << BB->getName() << "\n");
+  // DEBUG(errs() << "\n\nProcessing Basic Block: " << BB->getName() << "\n");
   Out << "\n\n/* Processing Basic Block: " << BB->getName() << " */\n";
 
   // Don't print the label for the basic block if there are no uses, or if
@@ -3400,19 +3748,19 @@ void CWriter::printBasicBlock(BasicBlock *BB) {
   Out << "/* " << GetValueName(BB) << ": */\n";
 
   // Output all of the instructions in the basic block...
-  for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E;
-      ++II) {
+  for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E; ++II) {
     Instruction *I = &*II;
-    //DEBUG(errs() << "*********Processing: " << *I << "\n");
+    // DEBUG(errs() << "*********Processing: " << *I << "\n");
     bool skip = false;
-    for(Use &U : I->operands()) {
+    for (Use &U : I->operands()) {
       Value *v = U.get();
-      if(PHINode *PN = dyn_cast<PHINode>(v)) {
+      if (PHINode *PN = dyn_cast<PHINode>(v)) {
         if (LInductionVars.find(PN) != LInductionVars.end()) {
           bool UserPHI = false;
           bool UserCMP = false;
           bool UserOTHER = false;
-          ////          DEBUG(errs() << "Instruction uses induction variable\n");
+          ////          DEBUG(errs() << "Instruction uses induction
+          /// variable\n");
           for (User *IUser : I->users()) {
             if (Instruction *UserInst = dyn_cast<Instruction>(IUser)) {
               //              DEBUG(errs() << "User: " << *UserInst << "\n");
@@ -3435,28 +3783,28 @@ void CWriter::printBasicBlock(BasicBlock *BB) {
       if (skip)
         break;
     }
-    if(skip){ 
-      //      DEBUG(errs() << "Skipping instruction that increments Induction Variable!\n");
+    if (skip) {
+      //      DEBUG(errs() << "Skipping instruction that increments Induction
+      //      Variable!\n");
       Out << "/* Skipped induction variable use: " << *I << " */\n";
       continue;
     }
-    if(PHINode *PN = dyn_cast<PHINode>(I)) {
-      if (LInductionVars.find(PN) != LInductionVars.end()) { 
+    if (PHINode *PN = dyn_cast<PHINode>(I)) {
+      if (LInductionVars.find(PN) != LInductionVars.end()) {
         //        DEBUG(errs() << "Skipping PHINode for Induction Variable!\n");
         Out << "/* PHINode of induction variable was here */\n";
         continue;
       }
     }
     if (!isInlinableInst(*II) && !isDirectAlloca(&*II)) {
-      if (!isEmptyType(II->getType()) &&
-          !isInlineAsm(*II))
+      if (!isEmptyType(II->getType()) && !isInlineAsm(*II))
         outputLValue(&*II);
       else
         Out << "  ";
       writeInstComputationInline(*II);
       Out << ";\n";
     } else {
-      //DEBUG(errs() << "Skipping inlinable or direct alloca!\n");
+      // DEBUG(errs() << "Skipping inlinable or direct alloca!\n");
     }
   }
 
@@ -3464,7 +3812,6 @@ void CWriter::printBasicBlock(BasicBlock *BB) {
   visit(*BB->getTerminator());
 }
 
-
 // Specific Instruction type classes... note that all of the casts are
 // necessary because we use the instruction classes as opaque types...
 //
@@ -3494,11 +3841,11 @@ void CWriter::visitReturnInst(ReturnInst &I) {
 }
 
 void CWriter::visitSwitchInst(SwitchInst &SI) {
-  Value* Cond = SI.getCondition();
+  Value *Cond = SI.getCondition();
   unsigned NumBits = cast<IntegerType>(Cond->getType())->getBitWidth();
 
   if (SI.getNumCases() == 0) { // unconditional branch
-    printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
+    printPHICopiesForSuccessor(SI.getParent(), SI.getDefaultDest(), 2);
     printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
     Out << "\n";
 
@@ -3506,18 +3853,17 @@ void CWriter::visitSwitchInst(SwitchInst &SI) {
     Out << "  switch (";
     writeOperand(Cond);
     Out << ") {\n  default:\n";
-    printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
+    printPHICopiesForSuccessor(SI.getParent(), SI.getDefaultDest(), 2);
     printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
 
-
     // CHECK: Needs much testing
     for (auto Case : SI.cases()) {
-      ConstantInt* CaseVal = Case.getCaseValue();
-      BasicBlock* Succ = Case.getCaseSuccessor();
+      ConstantInt *CaseVal = Case.getCaseValue();
+      BasicBlock *Succ = Case.getCaseSuccessor();
       Out << "  case ";
       writeOperand(CaseVal);
       Out << ":\n";
-      printPHICopiesForSuccessor (SI.getParent(), Succ, 2);
+      printPHICopiesForSuccessor(SI.getParent(), Succ, 2);
       if (isGotoCodeNecessary(SI.getParent(), Succ))
         printBranchToBlock(SI.getParent(), Succ, 2);
       else
@@ -3530,18 +3876,18 @@ void CWriter::visitSwitchInst(SwitchInst &SI) {
     // CHECK: Needs much testing
     for (auto Case : SI.cases()) {
       Out << "if (";
-      ConstantInt* CaseVal = Case.getCaseValue();
-      BasicBlock* Succ = Case.getCaseSuccessor();
+      ConstantInt *CaseVal = Case.getCaseValue();
+      BasicBlock *Succ = Case.getCaseSuccessor();
       ICmpInst *icmp = new ICmpInst(CmpInst::ICMP_EQ, Cond, CaseVal);
       visitICmpInst(*icmp);
       delete icmp;
       Out << ") {\n";
-      printPHICopiesForSuccessor (SI.getParent(), Succ, 2);
+      printPHICopiesForSuccessor(SI.getParent(), Succ, 2);
       printBranchToBlock(SI.getParent(), Succ, 2);
       Out << "  } else ";
     }
     Out << "{\n";
-    printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
+    printPHICopiesForSuccessor(SI.getParent(), SI.getDefaultDest(), 2);
     printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
     Out << "  }\n";
   }
@@ -3563,22 +3909,23 @@ bool CWriter::isGotoCodeNecessary(BasicBlock *From, BasicBlock *To) {
   return true;
 
   if (std::next(Function::iterator(From)) != Function::iterator(To))
-    return true;  // Not the direct successor, we need a goto.
+    return true; // Not the direct successor, we need a goto.
 
-  //isa<SwitchInst>(From->getTerminator())
+  // isa<SwitchInst>(From->getTerminator())
 
   if (LI->getLoopFor(From) != LI->getLoopFor(To))
     return true;
   return false;
 }
 
-void CWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock,
-    BasicBlock *Successor,
-    unsigned Indent) {
-  Out << "/* Printing PHIs for " << CurBlock->getName() << "->" << Successor->getName() << " */\n";
+void CWriter::printPHICopiesForSuccessor(BasicBlock *CurBlock,
+                                         BasicBlock *Successor,
+                                         unsigned Indent) {
+  Out << "/* Printing PHIs for " << CurBlock->getName() << "->"
+      << Successor->getName() << " */\n";
   for (BasicBlock::iterator I = Successor->begin(); isa<PHINode>(I); ++I) {
     PHINode *PN = cast<PHINode>(I);
-    if(LInductionVars.find(PN) == LInductionVars.end()) {
+    if (LInductionVars.find(PN) == LInductionVars.end()) {
       Out << "/* Printing phi node: " << *PN << " */\n";
       // Now we have to do the printing.
       Value *IV = PN->getIncomingValueForBlock(CurBlock);
@@ -3595,7 +3942,7 @@ void CWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock,
 }
 
 void CWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ,
-    unsigned Indent) {
+                                 unsigned Indent) {
   if (isGotoCodeNecessary(CurBB, Succ)) {
     Out << std::string(Indent, ' ') << "  goto ";
     writeOperand(Succ);
@@ -3603,76 +3950,89 @@ void CWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ,
   }
 }
 
-void CWriter::printBBorLoop (BasicBlock *BB) {
-  //DEBUG(errs() << "\nPrinting: " << BB->getName() << "\n");
+void CWriter::printBBorLoop(BasicBlock *BB) {
+  // DEBUG(errs() << "\nPrinting: " << BB->getName() << "\n");
   Out << "\n/* Printing: " << BB->getName() << " */\n";
-  if(VisitedBlocks.find(BB)!=VisitedBlocks.end() && ReplicateBlocks.find(BB)==ReplicateBlocks.end()) {
-    //DEBUG(errs() << "This BB has already been printed and is not marked for replication! exiting!\n");
-    Out << "/* This BB has already been printed and is not marked for replication! exiting! */\n";
-  } else if(!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == BB) {
-    //DEBUG(errs() << "Reached block that is top of stack, return instead!\n");
+  if (VisitedBlocks.find(BB) != VisitedBlocks.end() &&
+      ReplicateBlocks.find(BB) == ReplicateBlocks.end()) {
+    // DEBUG(errs() << "This BB has already been printed and is not marked for
+    // replication! exiting!\n");
+    Out << "/* This BB has already been printed and is not marked for "
+           "replication! exiting! */\n";
+  } else if (!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == BB) {
+    // DEBUG(errs() << "Reached block that is top of stack, return instead!\n");
     Out << "/* " << BB->getName() << " is top of stack, return instead! */\n";
     //    ImmPostDommBlocks.pop();
   } else {
     VisitedBlocks.insert(BB);
-    if(Loop *LL = LI->getLoopFor(BB)) {
+    if (Loop *LL = LI->getLoopFor(BB)) {
       if (LL->getHeader() == BB)
         printLoop(LL);
-      else 
+      else
         printBasicBlock(BB);
     } else {
       printBasicBlock(BB);
     }
   }
-
 }
 
-bool CWriter::compareBlocks(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock *ImmPostDomm) {
+bool CWriter::compareBlocks(BasicBlock *CurrBlock, BasicBlock *CompBlock,
+                            BasicBlock *ImmPostDomm) {
   CompVisitedBlocks.insert(CurrBlock);
-  //DEBUG(errs() << "--Comparing " << CurrBlock->getName() << " with " << CompBlock->getName() << "\n");
+  // DEBUG(errs() << "--Comparing " << CurrBlock->getName() << " with " <<
+  // CompBlock->getName() << "\n");
   if (CurrBlock == ImmPostDomm) {
-    //DEBUG(errs() << "----Reached Post Dominator, returning false!\n");
+    // DEBUG(errs() << "----Reached Post Dominator, returning false!\n");
     return false;
   } else if (CurrBlock == CompBlock) {
-    //DEBUG(errs() << "----Found a match! " << CurrBlock->getName() << " == " << CompBlock->getName() << "\n");
+    // DEBUG(errs() << "----Found a match! " << CurrBlock->getName() << " == "
+    // << CompBlock->getName() << "\n");
     return true;
   } else {
     bool res = false;
-    for (auto succ: successors(CurrBlock)) {
+    for (auto succ : successors(CurrBlock)) {
       if (CompVisitedBlocks.find(succ) == CompVisitedBlocks.end()) {
-        //DEBUG(errs() << "----Visiting successor " << succ->getName() << " of " << CurrBlock->getName() << "\n");
+        // DEBUG(errs() << "----Visiting successor " << succ->getName() << " of
+        // " << CurrBlock->getName() << "\n");
         res = res || compareBlocks(succ, CompBlock, ImmPostDomm);
       } else {
-        //DEBUG(errs() << "----Skipping successor " << succ->getName() << " of " << CurrBlock->getName() << "\n");
+        // DEBUG(errs() << "----Skipping successor " << succ->getName() << " of
+        // " << CurrBlock->getName() << "\n");
       }
     }
     return res;
   }
 }
 
-bool CWriter::findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock *ImmPostDomm) {
+bool CWriter::findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock,
+                        BasicBlock *ImmPostDomm) {
   if (CompBlock == ImmPostDomm) {
-    //DEBUG(errs() << "Reached PostDomm; returning!\n");
+    // DEBUG(errs() << "Reached PostDomm; returning!\n");
     return false;
   }
   FindVisitedBlocks.insert(CompBlock);
-  //DEBUG(errs() << "Finding match between " << CompBlock->getName() << " & " << CurrBlock->getName() << "\n");
+  // DEBUG(errs() << "Finding match between " << CompBlock->getName() << " & "
+  // << CurrBlock->getName() << "\n");
   bool compareResult = compareBlocks(CurrBlock, CompBlock, ImmPostDomm);
   CompVisitedBlocks.clear();
-  if (compareResult){
-    //DEBUG(errs() << "Match found, marking " << CompBlock->getName() << " for replication!\n");
+  if (compareResult) {
+    // DEBUG(errs() << "Match found, marking " << CompBlock->getName() << " for
+    // replication!\n");
     // Flag for replication
     ReplicateBlocks.insert(CompBlock);
     return true;
   } else {
     bool res = false;
-    for (auto succ: successors(CompBlock)) {
-      if(FindVisitedBlocks.find(succ) == FindVisitedBlocks.end()) {
-        //DEBUG(errs() << "Visiting successor " << succ->getName() << " of " << CompBlock->getName() << "\n");
+    for (auto succ : successors(CompBlock)) {
+      if (FindVisitedBlocks.find(succ) == FindVisitedBlocks.end()) {
+        // DEBUG(errs() << "Visiting successor " << succ->getName() << " of " <<
+        // CompBlock->getName() << "\n");
         res = res || findMatch(CurrBlock, succ, ImmPostDomm);
-        if (res == true) break;
+        if (res == true)
+          break;
       } else {
-        //DEBUG(errs() << "Skipping successor " << succ->getName() << " of " << CompBlock->getName() << "\n");
+        // DEBUG(errs() << "Skipping successor " << succ->getName() << " of " <<
+        // CompBlock->getName() << "\n");
       }
     }
     return res;
@@ -3682,13 +4042,13 @@ bool CWriter::findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock
 // that immediately succeeds the current one.
 //
 void CWriter::visitBranchInst(BranchInst &I) {
-  errs() << "Visiting Branch Instruction: " << I <<"\n";
+  errs() << "Visiting Branch Instruction: " << I << "\n";
   Out << "\n/* Branch: " << I << " */\n";
 
   if (I.isConditional()) {
     BasicBlock *BB0 = I.getSuccessor(0);
     BasicBlock *BB1 = I.getSuccessor(1);
-    BasicBlock *ImmPostDomm = PDT->findNearestCommonDominator(BB0,BB1);
+    BasicBlock *ImmPostDomm = PDT->findNearestCommonDominator(BB0, BB1);
 
     // Iterate over all BBs in then & else to find a matching BB
     // If found, mark it for replication
@@ -3696,166 +4056,189 @@ void CWriter::visitBranchInst(BranchInst &I) {
       findMatch(BB0, BB1, ImmPostDomm);
       FindVisitedBlocks.clear();
     }
-    if(Loop *L = LI->getLoopFor(I.getParent())) {
-      if(L == LI->getLoopFor(BB0) && !(L == LI->getLoopFor(BB1))) {
+    if (Loop *L = LI->getLoopFor(I.getParent())) {
+      if (L == LI->getLoopFor(BB0) && !(L == LI->getLoopFor(BB1))) {
         errs() << "This is a loop branch!\n";
         Out << "/* This is a loop branch! */\n";
-        //BB0 is in the loop. Print it if it hsn't been printed
-        if(VisitedBlocks.find(BB0) != VisitedBlocks.end()) {
+        // BB0 is in the loop. Print it if it hasn't been printed
+        if (VisitedBlocks.find(BB0) != VisitedBlocks.end()) {
           errs() << "Branching back to header: " << BB0->getName() << "\n";
           errs() << "This is the end of the loop, closing!\n";
           Out << "/* Branching back to header: " << BB0->getName() << " */\n";
           Out << "/* Closing loop! */\n";
-          //BB0 is the loop header. CLose the loop then print BB1.
-          printPHICopiesForSuccessor (I.getParent(), BB0, 2);
+          // BB0 is the loop header. Close the loop then print BB1.
+          printPHICopiesForSuccessor(I.getParent(), BB0, 2);
           Out << " }\n";
-          printPHICopiesForSuccessor (I.getParent(), BB1, 2);
+          printPHICopiesForSuccessor(I.getParent(), BB1, 2);
           printBBorLoop(BB1);
         } else {
-          errs() << "Not branching to header! Branching to: " << BB0->getName() << "\n";
-          //BB0 is not the loop header. That means we are entering loop body
+          errs() << "Not branching to header! Branching to: " << BB0->getName()
+                 << "\n";
+          // BB0 is not the loop header. That means we are entering loop body
 
           llvm_unreachable("loop branch unhandled!\n");
         }
-      } else if(L == LI->getLoopFor(BB1) && !(L == LI->getLoopFor(BB0))) {
+      } else if (L == LI->getLoopFor(BB1) && !(L == LI->getLoopFor(BB0))) {
         errs() << "This is a loop branch!\n";
         Out << "/* This is a loop branch! */\n";
-        if(VisitedBlocks.find(BB1) != VisitedBlocks.end()) {
+        if (VisitedBlocks.find(BB1) != VisitedBlocks.end()) {
           errs() << "Branching back to header: " << BB1->getName() << "\n";
           errs() << "This is the end of the loop, closing!\n";
           Out << "/* Branching back to header: " << BB1->getName() << " */\n";
           Out << "/* Closing loop! */\n";
-          //BB0 is the loop header. CLose the loop then print BB1.
-          printPHICopiesForSuccessor (I.getParent(), BB1, 2);
+          // BB1 is the loop header. Close the loop then print BB0.
+          printPHICopiesForSuccessor(I.getParent(), BB1, 2);
           Out << " }\n";
-          printPHICopiesForSuccessor (I.getParent(), BB0, 2);
+          printPHICopiesForSuccessor(I.getParent(), BB0, 2);
           printBBorLoop(BB0);
         } else {
-          errs() << "Not branching to header! Branching to: " << BB1->getName() << "\n";
-          //BB1 is not the loop header. That means we are entering loop body
+          errs() << "Not branching to header! Branching to: " << BB1->getName()
+                 << "\n";
+          // BB1 is not the loop header. That means we are entering loop body
           llvm_unreachable("loop branch unhandled!\n");
         }
       } else {
         errs() << "This is a conditional statement within a loop!\n";
         Out << "/* This is a conditional statement within a loop! */\n";
-        errs() << ImmPostDomm->getName() << " is the immediate post dominator of " << BB0->getName() << " and " << BB1->getName() << "\n";
-        if(VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) {
-          errs() << "Not pushing " << ImmPostDomm->getName() << " because it has already been visited!\n";
+        errs() << ImmPostDomm->getName()
+               << " is the immediate post dominator of " << BB0->getName()
+               << " and " << BB1->getName() << "\n";
+        if (VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) {
+          errs() << "Not pushing " << ImmPostDomm->getName()
+                 << " because it has already been visited!\n";
         } else {
           errs() << "Pushing " << ImmPostDomm->getName() << " onto stack!\n";
           ImmPostDommBlocks.push(ImmPostDomm);
         }
 
         bool noElse = false;
-        if(BB1 == ImmPostDomm) {
+        if (BB1 == ImmPostDomm) {
           noElse = true;
         }
         Out << "  if (";
         writeOperand(I.getCondition(), ContextCasted);
         Out << ") { /* " << I << "*/\n";
-        printPHICopiesForSuccessor (I.getParent(), BB0, 2);
+        printPHICopiesForSuccessor(I.getParent(), BB0, 2);
         printBBorLoop(BB0);
-        errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n";
-        Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n";
+        errs() << "Back to handling " << I.getParent()->getName() << ": " << I
+               << "\n";
+        Out << "/* Back to handling " << I.getParent()->getName() << ": " << I
+            << " */\n";
         if (!noElse) {
           errs() << "Printing else!\n";
           Out << "  } else { /*" << I << "*/\n";
-          printPHICopiesForSuccessor (I.getParent(), BB1, 2);
+          printPHICopiesForSuccessor(I.getParent(), BB1, 2);
           ElseBlocks.push(BB1);
           ElseBranches.push(&I);
           printBBorLoop(BB1);
-          errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n";
+          errs() << "Back to handling " << I.getParent()->getName() << ": " << I
+                 << "\n";
           errs() << "Check to see if else block is closed!\n";
-          Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n" ;
-          Out << "/* Check to see if else block is closed! */\n" ;
-          if(!ElseBlocks.empty() && ElseBlocks.top() == BB1) {
+          Out << "/* Back to handling " << I.getParent()->getName() << ": " << I
+              << " */\n";
+          Out << "/* Check to see if else block is closed! */\n";
+          if (!ElseBlocks.empty() && ElseBlocks.top() == BB1) {
             errs() << "Else block not closed, need to close braces!\n";
-            Out << "/* Else block not closed, need to close braces! */\n" ;
+            Out << "/* Else block not closed, need to close braces! */\n";
             Out << "} /* closing " << *(ElseBranches.top()) << " */\n";
             ElseBranches.pop();
             ElseBlocks.pop();
           }
-          if(!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == ImmPostDomm) {
+          if (!ImmPostDommBlocks.empty() &&
+              ImmPostDommBlocks.top() == ImmPostDomm) {
             errs() << "Will now pop post dom them handle it!\n";
             ImmPostDommBlocks.pop();
             printBBorLoop(ImmPostDomm);
           } else {
-            errs() << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n";
+            errs()
+                << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n";
           }
         } else {
-          errs() << "No else block. Adding one for phis, then moving to " << BB1->getName() << "!\n";
-          Out << "/* (3913) No else block. Adding one for phis, then moving to " << BB1->getName() << "! */\n";
+          errs() << "No else block. Adding one for phis, then moving to "
+                 << BB1->getName() << "!\n";
+          Out << "/* (3913) No else block. Adding one for phis, then moving to "
+              << BB1->getName() << "! */\n";
           Out << "  } /* closing " << I << "*/\n";
           errs() << "Will now pop post dom them handle it!\n";
           ImmPostDommBlocks.pop();
           Out << "else {\n";
-          printPHICopiesForSuccessor (I.getParent(), BB1, 2);
+          printPHICopiesForSuccessor(I.getParent(), BB1, 2);
           Out << "}\n";
           printBBorLoop(BB1);
         }
       }
     } else {
       errs() << "This is a conditional statement!\n";
-      errs() << ImmPostDomm->getName() << " is the immediate post dominator of " << BB0->getName() << " and " << BB1->getName() << "\n";
-      if(VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) {
-        errs() << "Not pushing " << ImmPostDomm->getName() << " because it has already been visited!\n";
+      errs() << ImmPostDomm->getName() << " is the immediate post dominator of "
+             << BB0->getName() << " and " << BB1->getName() << "\n";
+      if (VisitedBlocks.find(ImmPostDomm) != VisitedBlocks.end()) {
+        errs() << "Not pushing " << ImmPostDomm->getName()
+               << " because it has already been visited!\n";
       } else {
         errs() << "Pushing " << ImmPostDomm->getName() << " onto stack!\n";
         ImmPostDommBlocks.push(ImmPostDomm);
       }
       bool noElse = false;
-      if(BB1 == ImmPostDomm) {
+      if (BB1 == ImmPostDomm) {
         noElse = true;
       }
       Out << "  if (";
       writeOperand(I.getCondition(), ContextCasted);
       Out << ") { /* " << I << "*/\n";
-      printPHICopiesForSuccessor (I.getParent(), BB0, 2);
+      printPHICopiesForSuccessor(I.getParent(), BB0, 2);
       printBBorLoop(BB0);
-      errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n";
-      Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n" ;
+      errs() << "Back to handling " << I.getParent()->getName() << ": " << I
+             << "\n";
+      Out << "/* Back to handling " << I.getParent()->getName() << ": " << I
+          << " */\n";
       if (!noElse) {
         errs() << "Printing else!\n";
-        Out << "/* Printing else! */\n" ;
+        Out << "/* Printing else! */\n";
         Out << "  } else { /*" << I << "*/\n";
-        printPHICopiesForSuccessor (I.getParent(), BB1, 2);
+        printPHICopiesForSuccessor(I.getParent(), BB1, 2);
         ElseBlocks.push(BB1);
         ElseBranches.push(&I);
         printBBorLoop(BB1);
-        errs() << "Back to handling " << I.getParent()->getName() << ": " << I << "\n";
+        errs() << "Back to handling " << I.getParent()->getName() << ": " << I
+               << "\n";
         errs() << "Check to see if else block is closed!\n";
-        Out << "/* Back to handling " << I.getParent()->getName() << ": " << I << " */\n";
+        Out << "/* Back to handling " << I.getParent()->getName() << ": " << I
+            << " */\n";
         Out << "/* Check to see if else block is closed! */\n";
-        if(!ElseBlocks.empty() && ElseBlocks.top() == BB1) {
+        if (!ElseBlocks.empty() && ElseBlocks.top() == BB1) {
           errs() << "Else block not closed, need to close braces!\n";
           Out << "/* Else block not closed, need to close braces! */\n";
           Out << "} /* closing " << *(ElseBranches.top()) << " */\n";
           ElseBranches.pop();
           ElseBlocks.pop();
         }
-        if(!ImmPostDommBlocks.empty() && ImmPostDommBlocks.top() == ImmPostDomm) {
+        if (!ImmPostDommBlocks.empty() &&
+            ImmPostDommBlocks.top() == ImmPostDomm) {
           errs() << "Will now pop post dom them handle it!\n";
           ImmPostDommBlocks.pop();
           printBBorLoop(ImmPostDomm);
         } else {
-          errs() << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n";
+          errs()
+              << "*!*!*!*!*!*!Not sure what is happening here!*!*!*!*!*!*!\n";
         }
       } else {
-        errs() << "No else block. Adding one for phis, then moving to " << BB1->getName() << "!\n";
-        Out << "/* (3985) No else block. Adding one for phis, then moving to " << BB1->getName() << "! */\n";
+        errs() << "No else block. Adding one for phis, then moving to "
+               << BB1->getName() << "!\n";
+        Out << "/* (3985) No else block. Adding one for phis, then moving to "
+            << BB1->getName() << "! */\n";
         Out << "  } /* closing " << I << "*/\n";
         errs() << "Will now pop post dom them handle it!\n";
         ImmPostDommBlocks.pop();
         Out << "else {\n";
-        printPHICopiesForSuccessor (I.getParent(), BB1, 2);
+        printPHICopiesForSuccessor(I.getParent(), BB1, 2);
         Out << "}\n";
         printBBorLoop(BB1);
       }
     }
   } else {
     errs() << "This is an unconditional branch!\n";
-    BasicBlock *BB = I.getSuccessor(0); 
-    printPHICopiesForSuccessor (I.getParent(), BB, 2);
+    BasicBlock *BB = I.getSuccessor(0);
+    printPHICopiesForSuccessor(I.getParent(), BB, 2);
     if (!ElseBlocks.empty() && I.getParent() == ElseBlocks.top()) {
       errs() << "Branch marks end of else block, need to close braces!\n";
       Out << "/* Branch marks end of else block, need to close braces! */\n";
@@ -3875,13 +4258,11 @@ void CWriter::visitPHINode(PHINode &I) {
   if (LInductionVars.find(&I) == LInductionVars.end()) {
     writeOperand(&I);
     Out << "__PHI_TEMPORARY";
-  } 
-  else { 
-    //    DEBUG(errs() << "Skipping PHI node for induction variable!\n"); 
+  } else {
+    //    DEBUG(errs() << "Skipping PHI node for induction variable!\n");
   }
 }
 
-
 // NOTE: Moving LLVM-4 Binary Op functions here
 bool isNeg(const Value *V) {
   if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
@@ -3902,13 +4283,12 @@ bool isFNeg(const Value *V, bool IgnoreZeroSign) {
   return false;
 }
 
-
 Value *getNegArgument(Value *BinOp) {
   return cast<BinaryOperator>(BinOp)->getOperand(1);
 }
 
 const Value *getNegArgument(const Value *BinOp) {
-  return getNegArgument(const_cast<Value*>(BinOp));
+  return getNegArgument(const_cast<Value *>(BinOp));
 }
 
 Value *getFNegArgument(Value *BinOp) {
@@ -3916,7 +4296,7 @@ Value *getFNegArgument(Value *BinOp) {
 }
 
 const Value *getFNegArgument(const Value *BinOp) {
-  return getFNegArgument(const_cast<Value*>(BinOp));
+  return getFNegArgument(const_cast<Value *>(BinOp));
 }
 
 static inline bool isConstantAllOnes(const Value *V) {
@@ -3928,32 +4308,27 @@ static inline bool isConstantAllOnes(const Value *V) {
 bool isNot(const Value *V) {
   if (const BinaryOperator *Bop = dyn_cast<BinaryOperator>(V))
     return (Bop->getOpcode() == Instruction::Xor &&
-        (isConstantAllOnes(Bop->getOperand(1)) ||
-         isConstantAllOnes(Bop->getOperand(0))));
+            (isConstantAllOnes(Bop->getOperand(1)) ||
+             isConstantAllOnes(Bop->getOperand(0))));
   return false;
 }
 
-
 Value *getNotArgument(Value *BinOp) {
   assert(isNot(BinOp) && "getNotArgument on non-'not' instruction!");
   BinaryOperator *BO = cast<BinaryOperator>(BinOp);
   Value *Op0 = BO->getOperand(0);
   Value *Op1 = BO->getOperand(1);
-  if (isConstantAllOnes(Op0)) return Op1;
+  if (isConstantAllOnes(Op0))
+    return Op1;
 
   assert(isConstantAllOnes(Op1));
   return Op0;
 }
 
 const Value *getNotArgument(const Value *BinOp) {
-  return getNotArgument(const_cast<Value*>(BinOp));
+  return getNotArgument(const_cast<Value *>(BinOp));
 }
 
-
-
-
-
-
 void CWriter::visitBinaryOperator(BinaryOperator &I) {
   // binary instructions, shift instructions, setCond instructions.
   assert(!I.getType()->isPointerTy());
@@ -3979,7 +4354,8 @@ void CWriter::visitBinaryOperator(BinaryOperator &I) {
   //   DEBUG(
   //       if(needsCast) errs() << "****Needs Cast: \n" << I << "\n";
   //       else if(shouldCast) errs() << "****Should Cast: \n" << I << "\n";
-  //       else if(I.getType()->isVectorTy()) errs() << "****Is Vector Type: \n" << I << "\n";
+  //       else if(I.getType()->isVectorTy()) errs() << "****Is Vector Type: \n"
+  //       << I << "\n";
   //       );
   //
   //    Type *VTy = I.getOperand(0)->getType();
@@ -4019,13 +4395,13 @@ void CWriter::visitBinaryOperator(BinaryOperator &I) {
   // If this is a negation operation, print it out as such.  For FP, we don't
   // want to print "-0.0 - X".
 
-  //if (BinaryOperator::isNeg(&I)) {
+  // if (BinaryOperator::isNeg(&I)) {
   if (isNeg(&I)) {
     Out << "-(";
     writeOperand(getNegArgument(&I));
     Out << ")";
   }
-  //else if (BinaryOperator::isFNeg(&I)) {
+  // else if (BinaryOperator::isFNeg(&I)) {
   else if (isFNeg(&I, true)) {
     Out << "-(";
     writeOperand(getFNegArgument(&I));
@@ -4040,7 +4416,7 @@ void CWriter::visitBinaryOperator(BinaryOperator &I) {
       Out << "fmodf(";
     else if (I.getType() == Type::getDoubleTy(I.getContext()))
       Out << "fmod(";
-    else  // all 3 flavors of long double
+    else // all 3 flavors of long double
       Out << "fmodl(";
     writeOperand(I.getOperand(0), ContextCasted);
     Out << ", ";
@@ -4058,29 +4434,49 @@ void CWriter::visitBinaryOperator(BinaryOperator &I) {
     writeOperandWithCast(I.getOperand(0), I.getOpcode());
 
     switch (I.getOpcode()) {
-      case Instruction::Add:
-      case Instruction::FAdd: Out << " + "; break;
-      case Instruction::Sub:
-      case Instruction::FSub: Out << " - "; break;
-      case Instruction::Mul:
-      case Instruction::FMul: Out << " * "; break;
-      case Instruction::URem:
-      case Instruction::SRem:
-      case Instruction::FRem: Out << " % "; break;
-      case Instruction::UDiv:
-      case Instruction::SDiv:
-      case Instruction::FDiv: Out << " / "; break;
-      case Instruction::And:  Out << " & "; break;
-      case Instruction::Or:   Out << " | "; break;
-      case Instruction::Xor:  Out << " ^ "; break;
-      case Instruction::Shl : Out << " << "; break;
-      case Instruction::LShr:
-      case Instruction::AShr: Out << " >> "; break;
-      default:
+    case Instruction::Add:
+    case Instruction::FAdd:
+      Out << " + ";
+      break;
+    case Instruction::Sub:
+    case Instruction::FSub:
+      Out << " - ";
+      break;
+    case Instruction::Mul:
+    case Instruction::FMul:
+      Out << " * ";
+      break;
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+      Out << " % ";
+      break;
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+      Out << " / ";
+      break;
+    case Instruction::And:
+      Out << " & ";
+      break;
+    case Instruction::Or:
+      Out << " | ";
+      break;
+    case Instruction::Xor:
+      Out << " ^ ";
+      break;
+    case Instruction::Shl:
+      Out << " << ";
+      break;
+    case Instruction::LShr:
+    case Instruction::AShr:
+      Out << " >> ";
+      break;
+    default:
 #ifndef NDEBUG
-                              errs() << "Invalid operator type!" << I;
+      errs() << "Invalid operator type!" << I;
 #endif
-                              llvm_unreachable(0);
+      llvm_unreachable(0);
     }
 
     writeOperandWithCast(I.getOperand(1), I.getOpcode());
@@ -4090,8 +4486,8 @@ void CWriter::visitBinaryOperator(BinaryOperator &I) {
 }
 
 void CWriter::visitICmpInst(ICmpInst &I) {
-  if (I.getType()->isVectorTy()
-      || I.getOperand(0)->getType()->getPrimitiveSizeInBits() > 64) {
+  if (I.getType()->isVectorTy() ||
+      I.getOperand(0)->getType()->getPrimitiveSizeInBits() > 64) {
     Out << "llvm_icmp_" << getCmpPredicateName(I.getPredicate()) << "_";
     printTypeString(Out, I.getOperand(0)->getType(), I.isSigned());
     Out << "(";
@@ -4100,8 +4496,10 @@ void CWriter::visitICmpInst(ICmpInst &I) {
     writeOperand(I.getOperand(1), ContextCasted);
     Out << ")";
     if (VectorType *VTy = dyn_cast<VectorType>(I.getOperand(0)->getType())) {
-      CmpDeclTypes.insert(std::pair<CmpInst::Predicate, VectorType*>(I.getPredicate(), VTy));
-      TypedefDeclTypes.insert(I.getType()); // insert type not necessarily visible above
+      CmpDeclTypes.insert(
+          std::pair<CmpInst::Predicate, VectorType *>(I.getPredicate(), VTy));
+      TypedefDeclTypes.insert(
+          I.getType()); // insert type not necessarily visible above
     }
     return;
   }
@@ -4116,21 +4514,33 @@ void CWriter::visitICmpInst(ICmpInst &I) {
   writeOperandWithCast(I.getOperand(0), I);
 
   switch (I.getPredicate()) {
-    case ICmpInst::ICMP_EQ:  Out << " == "; break;
-    case ICmpInst::ICMP_NE:  Out << " != "; break;
-    case ICmpInst::ICMP_ULE:
-    case ICmpInst::ICMP_SLE: Out << " <= "; break;
-    case ICmpInst::ICMP_UGE:
-    case ICmpInst::ICMP_SGE: Out << " >= "; break;
-    case ICmpInst::ICMP_ULT:
-    case ICmpInst::ICMP_SLT: Out << " < "; break;
-    case ICmpInst::ICMP_UGT:
-    case ICmpInst::ICMP_SGT: Out << " > "; break;
-    default:
+  case ICmpInst::ICMP_EQ:
+    Out << " == ";
+    break;
+  case ICmpInst::ICMP_NE:
+    Out << " != ";
+    break;
+  case ICmpInst::ICMP_ULE:
+  case ICmpInst::ICMP_SLE:
+    Out << " <= ";
+    break;
+  case ICmpInst::ICMP_UGE:
+  case ICmpInst::ICMP_SGE:
+    Out << " >= ";
+    break;
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT:
+    Out << " < ";
+    break;
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT:
+    Out << " > ";
+    break;
+  default:
 #ifndef NDEBUG
-                             errs() << "Invalid icmp predicate!" << I;
+    errs() << "Invalid icmp predicate!" << I;
 #endif
-                             llvm_unreachable(0);
+    llvm_unreachable(0);
   }
 
   writeOperandWithCast(I.getOperand(1), I);
@@ -4148,8 +4558,10 @@ void CWriter::visitFCmpInst(FCmpInst &I) {
     writeOperand(I.getOperand(1), ContextCasted);
     Out << ")";
     if (VectorType *VTy = dyn_cast<VectorType>(I.getOperand(0)->getType())) {
-      CmpDeclTypes.insert(std::pair<CmpInst::Predicate, VectorType*>(I.getPredicate(), VTy));
-      TypedefDeclTypes.insert(I.getType()); // insert type not necessarily visible above
+      CmpDeclTypes.insert(
+          std::pair<CmpInst::Predicate, VectorType *>(I.getPredicate(), VTy));
+      TypedefDeclTypes.insert(
+          I.getType()); // insert type not necessarily visible above
     }
     return;
   }
@@ -4163,18 +4575,21 @@ void CWriter::visitFCmpInst(FCmpInst &I) {
   Out << ")";
 }
 
-static const char * getFloatBitCastField(Type *Ty) {
+static const char *getFloatBitCastField(Type *Ty) {
   switch (Ty->getTypeID()) {
-    default: llvm_unreachable("Invalid Type");
-    case Type::FloatTyID:  return "Float";
-    case Type::DoubleTyID: return "Double";
-    case Type::IntegerTyID: {
-                              unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
-                              if (NumBits <= 32)
-                                return "Int32";
-                              else
-                                return "Int64";
-                            }
+  default:
+    llvm_unreachable("Invalid Type");
+  case Type::FloatTyID:
+    return "Float";
+  case Type::DoubleTyID:
+    return "Double";
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits <= 32)
+      return "Int32";
+    else
+      return "Int64";
+  }
   }
 }
 
@@ -4183,9 +4598,9 @@ void CWriter::visitCastInst(CastInst &I) {
   Type *DstTy = I.getType();
   Type *SrcTy = I.getOperand(0)->getType();
 
-  if (DstTy->isVectorTy() || SrcTy->isVectorTy()
-      || DstTy->getPrimitiveSizeInBits() > 64
-      || SrcTy->getPrimitiveSizeInBits() > 64) {
+  if (DstTy->isVectorTy() || SrcTy->isVectorTy() ||
+      DstTy->getPrimitiveSizeInBits() > 64 ||
+      SrcTy->getPrimitiveSizeInBits() > 64) {
     Out << "llvm_" << I.getOpcodeName() << "_";
     printTypeString(Out, SrcTy, false);
     Out << "_";
@@ -4193,7 +4608,9 @@ void CWriter::visitCastInst(CastInst &I) {
     Out << "(";
     writeOperand(I.getOperand(0), ContextCasted);
     Out << ")";
-    CastOpDeclTypes.insert(std::pair<Instruction::CastOps, std::pair<Type*, Type*> >(I.getOpcode(), std::pair<Type*, Type*>(SrcTy, DstTy)));
+    CastOpDeclTypes.insert(
+        std::pair<Instruction::CastOps, std::pair<Type *, Type *>>(
+            I.getOpcode(), std::pair<Type *, Type *>(SrcTy, DstTy)));
     return;
   }
 
@@ -4201,10 +4618,10 @@ void CWriter::visitCastInst(CastInst &I) {
     Out << '(';
     // These int<->float and long<->double casts need to be handled specially
     Out << GetValueName(&I) << "__BITCAST_TEMPORARY."
-      << getFloatBitCastField(I.getOperand(0)->getType()) << " = ";
+        << getFloatBitCastField(I.getOperand(0)->getType()) << " = ";
     writeOperand(I.getOperand(0), ContextCasted);
     Out << ", " << GetValueName(&I) << "__BITCAST_TEMPORARY."
-      << getFloatBitCastField(I.getType());
+        << getFloatBitCastField(I.getType());
     Out << ')';
     return;
   }
@@ -4241,15 +4658,16 @@ void CWriter::visitSelectInst(SelectInst &I) {
   writeOperand(I.getFalseValue(), ContextCasted);
   Out << ")";
   SelectDeclTypes.insert(I.getType());
-  assert(I.getCondition()->getType()->isVectorTy() == I.getType()->isVectorTy()); // TODO: might be scalarty == vectorty
+  assert(I.getCondition()->getType()->isVectorTy() ==
+         I.getType()->isVectorTy()); // TODO: might be scalarty == vectorty
 }
 
 // Returns the macro name or value of the max or min of an integer type
 // (as defined in limits.h).
 static void printLimitValue(IntegerType &Ty, bool isSigned, bool isMax,
-    raw_ostream &Out) {
-  const char* type;
-  const char* sprefix = "";
+                            raw_ostream &Out) {
+  const char *type;
+  const char *sprefix = "";
 
   unsigned NumBits = Ty.getBitWidth();
   if (NumBits <= 8) {
@@ -4274,37 +4692,38 @@ static void printLimitValue(IntegerType &Ty, bool isSigned, bool isMax,
 #ifndef NDEBUG
 static bool isSupportedIntegerSize(IntegerType &T) {
   return T.getBitWidth() == 8 || T.getBitWidth() == 16 ||
-    T.getBitWidth() == 32 || T.getBitWidth() == 64 ||
-    T.getBitWidth() == 128;
+         T.getBitWidth() == 32 || T.getBitWidth() == 64 ||
+         T.getBitWidth() == 128;
 }
 #endif
 
-void CWriter::printIntrinsicDefinition(FunctionType *funT,
-    unsigned Opcode, std::string OpName, raw_ostream &Out) {
+void CWriter::printIntrinsicDefinition(FunctionType *funT, unsigned Opcode,
+                                       std::string OpName, raw_ostream &Out) {
   Type *retT = funT->getReturnType();
   Type *elemT = funT->getParamType(0);
   IntegerType *elemIntT = dyn_cast<IntegerType>(elemT);
   char i, numParams = funT->getNumParams();
   bool isSigned;
   switch (Opcode) {
-    default:
-      isSigned = false;
-      break;
-    case Intrinsic::sadd_with_overflow:
-    case Intrinsic::ssub_with_overflow:
-    case Intrinsic::smul_with_overflow:
-      isSigned = true;
-      break;
+  default:
+    isSigned = false;
+    break;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+    isSigned = true;
+    break;
   }
   assert(numParams > 0 && numParams < 26);
 
   if (isa<VectorType>(retT)) {
     // this looks general, but is only actually used for ctpop, ctlz, cttz
-    Type* *devecFunParams = (Type**)alloca(sizeof(Type*) * numParams);
+    Type **devecFunParams = (Type **)alloca(sizeof(Type *) * numParams);
     for (i = 0; i < numParams; i++) {
       devecFunParams[(int)i] = funT->params()[(int)i]->getScalarType();
     }
-    FunctionType *devecFunT = FunctionType::get(funT->getReturnType()->getScalarType(),
+    FunctionType *devecFunT = FunctionType::get(
+        funT->getReturnType()->getScalarType(),
         makeArrayRef(devecFunParams, numParams), funT->isVarArg());
     printIntrinsicDefinition(devecFunT, Opcode, OpName + "_devec", Out);
   }
@@ -4321,19 +4740,20 @@ void CWriter::printIntrinsicDefinition(FunctionType *funT,
   Out << "(";
   for (i = 0; i < numParams; i++) {
     switch (Opcode) {
-      // optional intrinsic validity assertion checks
-      default:
-        // default case: assume all parameters must have the same type
-        assert(elemT == funT->getParamType(i));
-        break;
-      case Intrinsic::ctlz:
-      case Intrinsic::cttz:
-      case Intrinsic::powi:
-        break;
+    // optional intrinsic validity assertion checks
+    default:
+      // default case: assume all parameters must have the same type
+      assert(elemT == funT->getParamType(i));
+      break;
+    case Intrinsic::ctlz:
+    case Intrinsic::cttz:
+    case Intrinsic::powi:
+      break;
     }
     printTypeNameUnaligned(Out, funT->getParamType(i), isSigned);
     Out << " " << (char)('a' + i);
-    if (i != numParams - 1) Out << ", ";
+    if (i != numParams - 1)
+      Out << ", ";
   }
   Out << ") {\n  ";
   printTypeName(Out, retT);
@@ -4346,106 +4766,106 @@ void CWriter::printIntrinsicDefinition(FunctionType *funT,
         Out << (char)('a' + j);
         if (isa<VectorType>(funT->params()[j]))
           Out << ".vector[" << (int)i << "]";
-        if (j != numParams - 1) Out << ", ";
+        if (j != numParams - 1)
+          Out << ", ";
       }
       Out << ");\n";
     }
-  }
-  else if (elemIntT) {
+  } else if (elemIntT) {
     // handle integer ops
     assert(isSupportedIntegerSize(*elemIntT) &&
-        "CBackend does not support arbitrary size integers.");
+           "CBackend does not support arbitrary size integers.");
     switch (Opcode) {
-      default:
+    default:
 #ifndef NDEBUG
-        errs() << "Unsupported Intrinsic!" << Opcode;
+      errs() << "Unsupported Intrinsic!" << Opcode;
 #endif
-        llvm_unreachable(0);
+      llvm_unreachable(0);
 
-      case Intrinsic::uadd_with_overflow:
-        //   r.field0 = a + b;
-        //   r.field1 = (r.field0 < a);
-        assert(cast<StructType>(retT)->getElementType(0) == elemT);
-        Out << "  r.field0 = a + b;\n";
-        Out << "  r.field1 = (a >= -b);\n";
-        break;
+    case Intrinsic::uadd_with_overflow:
+      //   r.field0 = a + b;
+      //   r.field1 = (r.field0 < a);
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field0 = a + b;\n";
+      Out << "  r.field1 = (a >= -b);\n";
+      break;
 
-      case Intrinsic::sadd_with_overflow:
-        //   r.field0 = a + b;
-        //   r.field1 = (b > 0 && a > XX_MAX - b) ||
-        //              (b < 0 && a < XX_MIN - b);
-        assert(cast<StructType>(retT)->getElementType(0) == elemT);
-        Out << "  r.field0 = a + b;\n";
-        Out << "  r.field1 = (b >= 0 ? a > ";
-        printLimitValue(*elemIntT, true, true, Out);
-        Out << " - b : a < ";
-        printLimitValue(*elemIntT, true, false, Out);
-        Out << " - b);\n";
-        break;
+    case Intrinsic::sadd_with_overflow:
+      //   r.field0 = a + b;
+      //   r.field1 = (b > 0 && a > XX_MAX - b) ||
+      //              (b < 0 && a < XX_MIN - b);
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field0 = a + b;\n";
+      Out << "  r.field1 = (b >= 0 ? a > ";
+      printLimitValue(*elemIntT, true, true, Out);
+      Out << " - b : a < ";
+      printLimitValue(*elemIntT, true, false, Out);
+      Out << " - b);\n";
+      break;
 
-      case Intrinsic::usub_with_overflow:
-        assert(cast<StructType>(retT)->getElementType(0) == elemT);
-        Out << "  r.field0 = a - b;\n";
-        Out << "  r.field1 = (a < b);\n";
-        break;
+    case Intrinsic::usub_with_overflow:
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field0 = a - b;\n";
+      Out << "  r.field1 = (a < b);\n";
+      break;
 
-      case Intrinsic::ssub_with_overflow:
-        assert(cast<StructType>(retT)->getElementType(0) == elemT);
-        Out << "  r.field0 = a - b;\n";
-        Out << "  r.field1 = (b <= 0 ? a > ";
-        printLimitValue(*elemIntT, true, true, Out);
-        Out << " + b : a < ";
-        printLimitValue(*elemIntT, true, false, Out);
-        Out << " + b);\n";
-        break;
+    case Intrinsic::ssub_with_overflow:
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field0 = a - b;\n";
+      Out << "  r.field1 = (b <= 0 ? a > ";
+      printLimitValue(*elemIntT, true, true, Out);
+      Out << " + b : a < ";
+      printLimitValue(*elemIntT, true, false, Out);
+      Out << " + b);\n";
+      break;
 
-      case Intrinsic::umul_with_overflow:
-        assert(cast<StructType>(retT)->getElementType(0) == elemT);
-        Out << "  r.field1 = LLVMMul_uov(8 * sizeof(a), &a, &b, &r.field0);\n";
-        break;
+    case Intrinsic::umul_with_overflow:
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field1 = LLVMMul_uov(8 * sizeof(a), &a, &b, &r.field0);\n";
+      break;
 
-      case Intrinsic::smul_with_overflow:
-        assert(cast<StructType>(retT)->getElementType(0) == elemT);
-        Out << "  r.field1 = LLVMMul_sov(8 * sizeof(a), &a, &b, &r.field0);\n";
-        break;
+    case Intrinsic::smul_with_overflow:
+      assert(cast<StructType>(retT)->getElementType(0) == elemT);
+      Out << "  r.field1 = LLVMMul_sov(8 * sizeof(a), &a, &b, &r.field0);\n";
+      break;
 
-      case Intrinsic::bswap:
-        assert(retT == elemT);
-        Out << "  LLVMFlipAllBits(8 * sizeof(a), &a, &r);\n";
-        break;
+    case Intrinsic::bswap:
+      assert(retT == elemT);
+      Out << "  LLVMFlipAllBits(8 * sizeof(a), &a, &r);\n";
+      break;
 
-      case Intrinsic::ctpop:
-        assert(retT == elemT);
-        Out << "  r = ";
-        if (retT->getPrimitiveSizeInBits() > 64)
-          Out << "llvm_ctor_u128(0, ";
-        Out << "LLVMCountPopulation(8 * sizeof(a), &a)";
-        if (retT->getPrimitiveSizeInBits() > 64)
-          Out << ")";
-        Out << ";\n";
-        break;
+    case Intrinsic::ctpop:
+      assert(retT == elemT);
+      Out << "  r = ";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << "llvm_ctor_u128(0, ";
+      Out << "LLVMCountPopulation(8 * sizeof(a), &a)";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << ")";
+      Out << ";\n";
+      break;
 
-      case Intrinsic::ctlz:
-        assert(retT == elemT);
-        Out << "  (void)b;\n  r = ";
-        if (retT->getPrimitiveSizeInBits() > 64)
-          Out << "llvm_ctor_u128(0, ";
-        Out << "LLVMCountLeadingZeros(8 * sizeof(a), &a)";
-        if (retT->getPrimitiveSizeInBits() > 64)
-          Out << ")";
-        Out << ";\n";
-        break;
+    case Intrinsic::ctlz:
+      assert(retT == elemT);
+      Out << "  (void)b;\n  r = ";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << "llvm_ctor_u128(0, ";
+      Out << "LLVMCountLeadingZeros(8 * sizeof(a), &a)";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << ")";
+      Out << ";\n";
+      break;
 
-      case Intrinsic::cttz:
-        assert(retT == elemT);
-        Out << "  (void)b;\n  r = ";
-        if (retT->getPrimitiveSizeInBits() > 64)
-          Out << "llvm_ctor_u128(0, ";
-        Out << "LLVMCountTrailingZeros(8 * sizeof(a), &a)";
-        if (retT->getPrimitiveSizeInBits() > 64)
-          Out << ")";
-        Out << ";\n";
-        break;
+    case Intrinsic::cttz:
+      assert(retT == elemT);
+      Out << "  (void)b;\n  r = ";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << "llvm_ctor_u128(0, ";
+      Out << "LLVMCountTrailingZeros(8 * sizeof(a), &a)";
+      if (retT->getPrimitiveSizeInBits() > 64)
+        Out << ")";
+      Out << ";\n";
+      break;
     }
 
   } else {
@@ -4468,49 +4888,48 @@ void CWriter::printIntrinsicDefinition(FunctionType *funT,
     }
 
     switch (Opcode) {
-      default:
+    default:
 #ifndef NDEBUG
-        errs() << "Unsupported Intrinsic!" << Opcode;
+      errs() << "Unsupported Intrinsic!" << Opcode;
 #endif
-        llvm_unreachable(0);
-
-      case Intrinsic::ceil:
-        Out << "  r = ceil" << suffix << "(a);\n";
-        break;
+      llvm_unreachable(0);
 
-      case Intrinsic::fabs:
-        Out << "  r = fabs" << suffix << "(a);\n";
-        break;
+    case Intrinsic::ceil:
+      Out << "  r = ceil" << suffix << "(a);\n";
+      break;
 
-      case Intrinsic::floor:
-        Out << "  r = floor" << suffix << "(a);\n";
-        break;
+    case Intrinsic::fabs:
+      Out << "  r = fabs" << suffix << "(a);\n";
+      break;
 
-      case Intrinsic::fma:
-        Out << "  r = fma" << suffix << "(a, b, c);\n";
-        break;
+    case Intrinsic::floor:
+      Out << "  r = floor" << suffix << "(a);\n";
+      break;
 
-      case Intrinsic::fmuladd:
-        Out << "  r = a * b + c;\n";
-        break;
+    case Intrinsic::fma:
+      Out << "  r = fma" << suffix << "(a, b, c);\n";
+      break;
 
-      case Intrinsic::pow:
-      case Intrinsic::powi:
-        Out << "  r = pow" << suffix << "(a, b);\n";
-        break;
+    case Intrinsic::fmuladd:
+      Out << "  r = a * b + c;\n";
+      break;
 
-      case Intrinsic::rint:
-        Out << "  r = rint" << suffix << "(a);\n";
-        break;
+    case Intrinsic::pow:
+    case Intrinsic::powi:
+      Out << "  r = pow" << suffix << "(a, b);\n";
+      break;
 
-      case Intrinsic::sqrt:
-        Out << "  r = sqrt" << suffix << "(a);\n";
-        break;
+    case Intrinsic::rint:
+      Out << "  r = rint" << suffix << "(a);\n";
+      break;
 
-      case Intrinsic::trunc:
-        Out << "  r = trunc" << suffix << "(a);\n";
-        break;
+    case Intrinsic::sqrt:
+      Out << "  r = sqrt" << suffix << "(a);\n";
+      break;
 
+    case Intrinsic::trunc:
+      Out << "  r = trunc" << suffix << "(a);\n";
+      break;
     }
   }
 
@@ -4528,73 +4947,74 @@ void CWriter::lowerIntrinsics(Function &F) {
   // Examine all the instructions in this function to find the intrinsics that
   // need to be lowered.
   for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB)
-    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;)
       if (CallInst *CI = dyn_cast<CallInst>(I++))
         if (Function *F = CI->getCalledFunction())
           switch (F->getIntrinsicID()) {
-            case Intrinsic::not_intrinsic:
-            case Intrinsic::vastart:
-            case Intrinsic::vacopy:
-            case Intrinsic::vaend:
-            case Intrinsic::returnaddress:
-            case Intrinsic::frameaddress:
-            case Intrinsic::setjmp:
-            case Intrinsic::longjmp:
-            case Intrinsic::sigsetjmp:
-            case Intrinsic::siglongjmp:
-            case Intrinsic::prefetch:
-            case Intrinsic::x86_sse_cmp_ss:
-            case Intrinsic::x86_sse_cmp_ps:
-            case Intrinsic::x86_sse2_cmp_sd:
-            case Intrinsic::x86_sse2_cmp_pd:
-            case Intrinsic::ppc_altivec_lvsl:
-            case Intrinsic::uadd_with_overflow:
-            case Intrinsic::sadd_with_overflow:
-            case Intrinsic::usub_with_overflow:
-            case Intrinsic::ssub_with_overflow:
-            case Intrinsic::umul_with_overflow:
-            case Intrinsic::smul_with_overflow:
-            case Intrinsic::bswap:
-            case Intrinsic::ceil:
-            case Intrinsic::ctlz:
-            case Intrinsic::ctpop:
-            case Intrinsic::cttz:
-            case Intrinsic::fabs:
-            case Intrinsic::floor:
-            case Intrinsic::fma:
-            case Intrinsic::fmuladd:
-            case Intrinsic::pow:
-            case Intrinsic::powi:
-            case Intrinsic::rint:
-            case Intrinsic::sqrt:
-            case Intrinsic::trunc:
-            case Intrinsic::trap:
-            case Intrinsic::stackprotector:
-            case Intrinsic::dbg_value:
-            case Intrinsic::dbg_declare:
-              // We directly implement these intrinsics
-              break;
-            default:
-              // All other intrinsic calls we must lower.
-              BasicBlock::iterator Before = E;
-              if (CI != &BB->front())
-                Before = std::prev(BasicBlock::iterator(CI));
-
-              IL->LowerIntrinsicCall(CI);
-              if (Before != E) {        // Move iterator to instruction after call
-                I = Before; ++I;
-              } else {
-                I = BB->begin();
-              }
-              // If the intrinsic got lowered to another call, and that call has
-              // a definition then we need to make sure its prototype is emitted
-              // before any calls to it.
-              if (CallInst *Call = dyn_cast<CallInst>(I))
-                if (Function *NewF = Call->getCalledFunction())
-                  if (!NewF->isDeclaration())
-                    prototypesToGen.push_back(NewF);
-
-              break;
+          case Intrinsic::not_intrinsic:
+          case Intrinsic::vastart:
+          case Intrinsic::vacopy:
+          case Intrinsic::vaend:
+          case Intrinsic::returnaddress:
+          case Intrinsic::frameaddress:
+          case Intrinsic::setjmp:
+          case Intrinsic::longjmp:
+          case Intrinsic::sigsetjmp:
+          case Intrinsic::siglongjmp:
+          case Intrinsic::prefetch:
+          case Intrinsic::x86_sse_cmp_ss:
+          case Intrinsic::x86_sse_cmp_ps:
+          case Intrinsic::x86_sse2_cmp_sd:
+          case Intrinsic::x86_sse2_cmp_pd:
+          case Intrinsic::ppc_altivec_lvsl:
+          case Intrinsic::uadd_with_overflow:
+          case Intrinsic::sadd_with_overflow:
+          case Intrinsic::usub_with_overflow:
+          case Intrinsic::ssub_with_overflow:
+          case Intrinsic::umul_with_overflow:
+          case Intrinsic::smul_with_overflow:
+          case Intrinsic::bswap:
+          case Intrinsic::ceil:
+          case Intrinsic::ctlz:
+          case Intrinsic::ctpop:
+          case Intrinsic::cttz:
+          case Intrinsic::fabs:
+          case Intrinsic::floor:
+          case Intrinsic::fma:
+          case Intrinsic::fmuladd:
+          case Intrinsic::pow:
+          case Intrinsic::powi:
+          case Intrinsic::rint:
+          case Intrinsic::sqrt:
+          case Intrinsic::trunc:
+          case Intrinsic::trap:
+          case Intrinsic::stackprotector:
+          case Intrinsic::dbg_value:
+          case Intrinsic::dbg_declare:
+            // We directly implement these intrinsics
+            break;
+          default:
+            // All other intrinsic calls we must lower.
+            BasicBlock::iterator Before = E;
+            if (CI != &BB->front())
+              Before = std::prev(BasicBlock::iterator(CI));
+
+            IL->LowerIntrinsicCall(CI);
+            if (Before != E) { // Move iterator to instruction after call
+              I = Before;
+              ++I;
+            } else {
+              I = BB->begin();
+            }
+            // If the intrinsic got lowered to another call, and that call has
+            // a definition then we need to make sure its prototype is emitted
+            // before any calls to it.
+            if (CallInst *Call = dyn_cast<CallInst>(I))
+              if (Function *NewF = Call->getCalledFunction())
+                if (!NewF->isDeclaration())
+                  prototypesToGen.push_back(NewF);
+
+            break;
           }
 }
 
@@ -4610,8 +5030,8 @@ void CWriter::visitCallInst(CallInst &I) {
 
   Value *Callee = I.getCalledValue();
 
-  PointerType  *PTy   = cast<PointerType>(Callee->getType());
-  FunctionType *FTy   = cast<FunctionType>(PTy->getElementType());
+  PointerType *PTy = cast<PointerType>(Callee->getType());
+  FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
 
   // If this is a call to a struct-return function, assign to the first
   // parameter instead of passing it to the call.
@@ -4625,11 +5045,14 @@ void CWriter::visitCallInst(CallInst &I) {
     Out << " = ";
   }
 
-  if (I.isTailCall()) Out << " /*tail*/ ";
+  if (I.isTailCall())
+    Out << " /*tail*/ ";
 
   // If this is an indirect call to a struct return function, we need to cast
   // the pointer. Ditto for indirect calls with byval arguments.
-  bool NeedsCast = (hasByVal || isStructRet || I.getCallingConv() != CallingConv::C) && !isa<Function>(Callee);
+  bool NeedsCast =
+      (hasByVal || isStructRet || I.getCallingConv() != CallingConv::C) &&
+      !isa<Function>(Callee);
 
   // GCC is a real PITA.  It does not permit codegening casts of functions to
   // function pointers if they are in a call (it generates a trap instruction
@@ -4653,11 +5076,13 @@ void CWriter::visitCallInst(CallInst &I) {
   if (NeedsCast) {
     // Ok, just cast the pointer type.
     Out << "((";
-    printTypeName(Out, I.getCalledValue()->getType()->getPointerElementType(), false, std::make_pair(PAL, I.getCallingConv()));
+    printTypeName(Out, I.getCalledValue()->getType()->getPointerElementType(),
+                  false, std::make_pair(PAL, I.getCallingConv()));
     Out << "*)(void*)";
   }
   writeOperand(Callee, ContextCasted);
-  if (NeedsCast) Out << ')';
+  if (NeedsCast)
+    Out << ')';
 
   Out << '(';
 
@@ -4671,7 +5096,7 @@ void CWriter::visitCallInst(CallInst &I) {
   CallSite CS(&I);
   CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
   unsigned ArgNo = 0;
-  if (isStructRet) {   // Skip struct return argument.
+  if (isStructRet) { // Skip struct return argument.
     ++AI;
     ++ArgNo;
   }
@@ -4689,16 +5114,18 @@ void CWriter::visitCallInst(CallInst &I) {
   }
 
   for (; AI != AE; ++AI, ++ArgNo) {
-    if (PrintedArg) Out << ", ";
+    if (PrintedArg)
+      Out << ", ";
     if (ArgNo < NumDeclaredParams &&
         (*AI)->getType() != FTy->getParamType(ArgNo)) {
       Out << '(';
-      printTypeNameUnaligned(Out, FTy->getParamType(ArgNo),
-          /*isSigned=*/PAL.hasAttribute(ArgNo+1, Attribute::SExt));
+      printTypeNameUnaligned(
+          Out, FTy->getParamType(ArgNo),
+          /*isSigned=*/PAL.hasAttribute(ArgNo + 1, Attribute::SExt));
       Out << ')';
     }
     // Check if the argument is expected to be passed by value.
-    if (I.getAttributes().hasAttribute(ArgNo+1, Attribute::ByVal))
+    if (I.getAttributes().hasAttribute(ArgNo + 1, Attribute::ByVal))
       writeOperandDeref(*AI);
     else
       writeOperand(*AI, ContextCasted);
@@ -4712,175 +5139,191 @@ void CWriter::visitCallInst(CallInst &I) {
 bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID) {
 
   switch (ID) {
-    default: {
+  default: {
 #ifndef NDEBUG
-               errs() << "Unknown LLVM intrinsic! " << I;
+    errs() << "Unknown LLVM intrinsic! " << I;
 #endif
-               llvm_unreachable(0);
-               return false;
-             }
-
-    case Intrinsic::dbg_value:
-    case Intrinsic::dbg_declare:
-             return true; // ignore these intrinsics
-    case Intrinsic::vastart:
-             Out << "0; ";
-
-             Out << "va_start(*(va_list*)";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             Out << ", ";
-             // Output the last argument to the enclosing function.
-             if (I.getParent()->getParent()->arg_empty())
-               Out << "vararg_dummy_arg";
-             else
-               writeOperand(&*(I.getParent()->getParent()->arg_end() - 1));
-             Out << ')';
-             return true;
-    case Intrinsic::vaend:
-             if (!isa<ConstantPointerNull>(I.getArgOperand(0))) {
-               Out << "0; va_end(*(va_list*)";
-               writeOperand(I.getArgOperand(0), ContextCasted);
-               Out << ')';
-             } else {
-               Out << "va_end(*(va_list*)0)";
-             }
-             return true;
-    case Intrinsic::vacopy:
-             Out << "0; ";
-             Out << "va_copy(*(va_list*)";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             Out << ", *(va_list*)";
-             writeOperand(I.getArgOperand(1), ContextCasted);
-             Out << ')';
-             return true;
-    case Intrinsic::returnaddress:
-             Out << "__builtin_return_address(";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             Out << ')';
-             return true;
-    case Intrinsic::frameaddress:
-             Out << "__builtin_frame_address(";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             Out << ')';
-             return true;
-    case Intrinsic::setjmp:
-             Out << "setjmp(*(jmp_buf*)";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             Out << ')';
-             return true;
-    case Intrinsic::longjmp:
-             Out << "longjmp(*(jmp_buf*)";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             Out << ", ";
-             writeOperand(I.getArgOperand(1), ContextCasted);
-             Out << ')';
-             return true;
-    case Intrinsic::sigsetjmp:
-             Out << "sigsetjmp(*(sigjmp_buf*)";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             Out << ',';
-             writeOperand(I.getArgOperand(1), ContextCasted);
-             Out << ')';
-             return true;
-    case Intrinsic::siglongjmp:
-             Out << "siglongjmp(*(sigjmp_buf*)";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             Out << ", ";
-             writeOperand(I.getArgOperand(1), ContextCasted);
-             Out << ')';
-             return true;
-    case Intrinsic::prefetch:
-             Out << "LLVM_PREFETCH((const void *)";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             Out << ", ";
-             writeOperand(I.getArgOperand(1), ContextCasted);
-             Out << ", ";
-             writeOperand(I.getArgOperand(2), ContextCasted);
-             Out << ")";
-             return true;
-    case Intrinsic::stacksave:
-             // Emit this as: Val = 0; *((void**)&Val) = __builtin_stack_save()
-             // to work around GCC bugs (see PR1809).
-             Out << "0; *((void**)&" << GetValueName(&I)
-               << ") = __builtin_stack_save()";
-             return true;
-    case Intrinsic::x86_sse_cmp_ss:
-    case Intrinsic::x86_sse_cmp_ps:
-    case Intrinsic::x86_sse2_cmp_sd:
-    case Intrinsic::x86_sse2_cmp_pd:
-             Out << '(';
-             printTypeName(Out, I.getType());
-             Out << ')';
-             // Multiple GCC builtins multiplex onto this intrinsic.
-             switch (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()) {
-               default: llvm_unreachable("Invalid llvm.x86.sse.cmp!");
-               case 0: Out << "__builtin_ia32_cmpeq"; break;
-               case 1: Out << "__builtin_ia32_cmplt"; break;
-               case 2: Out << "__builtin_ia32_cmple"; break;
-               case 3: Out << "__builtin_ia32_cmpunord"; break;
-               case 4: Out << "__builtin_ia32_cmpneq"; break;
-               case 5: Out << "__builtin_ia32_cmpnlt"; break;
-               case 6: Out << "__builtin_ia32_cmpnle"; break;
-               case 7: Out << "__builtin_ia32_cmpord"; break;
-             }
-             if (ID == Intrinsic::x86_sse_cmp_ps || ID == Intrinsic::x86_sse2_cmp_pd)
-               Out << 'p';
-             else
-               Out << 's';
-             if (ID == Intrinsic::x86_sse_cmp_ss || ID == Intrinsic::x86_sse_cmp_ps)
-               Out << 's';
-             else
-               Out << 'd';
-
-             Out << "(";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             Out << ", ";
-             writeOperand(I.getArgOperand(1), ContextCasted);
-             Out << ")";
-             return true;
-    case Intrinsic::ppc_altivec_lvsl:
-             Out << '(';
-             printTypeName(Out, I.getType());
-             Out << ')';
-             Out << "__builtin_altivec_lvsl(0, (void*)";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             Out << ")";
-             return true;
-    case Intrinsic::stackprotector:
-             writeOperandDeref(I.getArgOperand(1));
-             Out << " = ";
-             writeOperand(I.getArgOperand(0), ContextCasted);
-             return true;
-    case Intrinsic::uadd_with_overflow:
-    case Intrinsic::sadd_with_overflow:
-    case Intrinsic::usub_with_overflow:
-    case Intrinsic::ssub_with_overflow:
-    case Intrinsic::umul_with_overflow:
-    case Intrinsic::smul_with_overflow:
-    case Intrinsic::bswap:
-    case Intrinsic::ceil:
-    case Intrinsic::ctlz:
-    case Intrinsic::ctpop:
-    case Intrinsic::cttz:
-    case Intrinsic::fabs:
-    case Intrinsic::floor:
-    case Intrinsic::fma:
-    case Intrinsic::fmuladd:
-    case Intrinsic::pow:
-    case Intrinsic::powi:
-    case Intrinsic::rint:
-    case Intrinsic::sqrt:
-    case Intrinsic::trap:
-    case Intrinsic::trunc:
-             return false; // these use the normal function call emission
+    llvm_unreachable(0);
+    return false;
+  }
+
+  case Intrinsic::dbg_value:
+  case Intrinsic::dbg_declare:
+    return true; // ignore these intrinsics
+  case Intrinsic::vastart:
+    Out << "0; ";
+
+    Out << "va_start(*(va_list*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", ";
+    // Output the last argument to the enclosing function.
+    if (I.getParent()->getParent()->arg_empty())
+      Out << "vararg_dummy_arg";
+    else
+      writeOperand(&*(I.getParent()->getParent()->arg_end() - 1));
+    Out << ')';
+    return true;
+  case Intrinsic::vaend:
+    if (!isa<ConstantPointerNull>(I.getArgOperand(0))) {
+      Out << "0; va_end(*(va_list*)";
+      writeOperand(I.getArgOperand(0), ContextCasted);
+      Out << ')';
+    } else {
+      Out << "va_end(*(va_list*)0)";
+    }
+    return true;
+  case Intrinsic::vacopy:
+    Out << "0; ";
+    Out << "va_copy(*(va_list*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", *(va_list*)";
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::returnaddress:
+    Out << "__builtin_return_address(";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::frameaddress:
+    Out << "__builtin_frame_address(";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::setjmp:
+    Out << "setjmp(*(jmp_buf*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::longjmp:
+    Out << "longjmp(*(jmp_buf*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::sigsetjmp:
+    Out << "sigsetjmp(*(sigjmp_buf*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ',';
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::siglongjmp:
+    Out << "siglongjmp(*(sigjmp_buf*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ')';
+    return true;
+  case Intrinsic::prefetch:
+    Out << "LLVM_PREFETCH((const void *)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getArgOperand(2), ContextCasted);
+    Out << ")";
+    return true;
+  case Intrinsic::stacksave:
+    // Emit this as: Val = 0; *((void**)&Val) = __builtin_stack_save()
+    // to work around GCC bugs (see PR1809).
+    Out << "0; *((void**)&" << GetValueName(&I) << ") = __builtin_stack_save()";
+    return true;
+  case Intrinsic::x86_sse_cmp_ss:
+  case Intrinsic::x86_sse_cmp_ps:
+  case Intrinsic::x86_sse2_cmp_sd:
+  case Intrinsic::x86_sse2_cmp_pd:
+    Out << '(';
+    printTypeName(Out, I.getType());
+    Out << ')';
+    // Multiple GCC builtins multiplex onto this intrinsic.
+    switch (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue()) {
+    default:
+      llvm_unreachable("Invalid llvm.x86.sse.cmp!");
+    case 0:
+      Out << "__builtin_ia32_cmpeq";
+      break;
+    case 1:
+      Out << "__builtin_ia32_cmplt";
+      break;
+    case 2:
+      Out << "__builtin_ia32_cmple";
+      break;
+    case 3:
+      Out << "__builtin_ia32_cmpunord";
+      break;
+    case 4:
+      Out << "__builtin_ia32_cmpneq";
+      break;
+    case 5:
+      Out << "__builtin_ia32_cmpnlt";
+      break;
+    case 6:
+      Out << "__builtin_ia32_cmpnle";
+      break;
+    case 7:
+      Out << "__builtin_ia32_cmpord";
+      break;
+    }
+    if (ID == Intrinsic::x86_sse_cmp_ps || ID == Intrinsic::x86_sse2_cmp_pd)
+      Out << 'p';
+    else
+      Out << 's';
+    if (ID == Intrinsic::x86_sse_cmp_ss || ID == Intrinsic::x86_sse_cmp_ps)
+      Out << 's';
+    else
+      Out << 'd';
+
+    Out << "(";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ", ";
+    writeOperand(I.getArgOperand(1), ContextCasted);
+    Out << ")";
+    return true;
+  case Intrinsic::ppc_altivec_lvsl:
+    Out << '(';
+    printTypeName(Out, I.getType());
+    Out << ')';
+    Out << "__builtin_altivec_lvsl(0, (void*)";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    Out << ")";
+    return true;
+  case Intrinsic::stackprotector:
+    writeOperandDeref(I.getArgOperand(1));
+    Out << " = ";
+    writeOperand(I.getArgOperand(0), ContextCasted);
+    return true;
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::umul_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::bswap:
+  case Intrinsic::ceil:
+  case Intrinsic::ctlz:
+  case Intrinsic::ctpop:
+  case Intrinsic::cttz:
+  case Intrinsic::fabs:
+  case Intrinsic::floor:
+  case Intrinsic::fma:
+  case Intrinsic::fmuladd:
+  case Intrinsic::pow:
+  case Intrinsic::powi:
+  case Intrinsic::rint:
+  case Intrinsic::sqrt:
+  case Intrinsic::trap:
+  case Intrinsic::trunc:
+    return false; // these use the normal function call emission
   }
 }
 
-//This converts the llvm constraint string to something gcc is expecting.
-//TODO: work out platform independent constraints and factor those out
+// This converts the llvm constraint string to something gcc is expecting.
+// TODO: work out platform independent constraints and factor those out
 //      of the per target tables
 //      handle multiple constraint codes
-std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
+std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo &c) {
   return TargetLowering::AsmOperandInfo(c).ConstraintCode;
 #if 0
   assert(c.Codes.size() == 1 && "Too many asm constraint codes to handle");
@@ -4917,7 +5360,7 @@ std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
 #endif
 }
 
-//TODO: import logic from AsmPrinter.cpp
+// TODO: import logic from AsmPrinter.cpp
 static std::string gccifyAsm(std::string asmstr) {
   for (std::string::size_type i = 0; i != asmstr.size(); ++i)
     if (asmstr[i] == '\n')
@@ -4928,27 +5371,28 @@ static std::string gccifyAsm(std::string asmstr) {
       if (asmstr[i + 1] == '{') {
         std::string::size_type a = asmstr.find_first_of(':', i + 1);
         std::string::size_type b = asmstr.find_first_of('}', i + 1);
-        std::string n = "%" +
-          asmstr.substr(a + 1, b - a - 1) +
-          asmstr.substr(i + 2, a - i - 2);
+        std::string n = "%" + asmstr.substr(a + 1, b - a - 1) +
+                        asmstr.substr(i + 2, a - i - 2);
         asmstr.replace(i, b - i + 1, n);
         i += n.size() - 1;
       } else
         asmstr.replace(i, 1, "%");
+    } else if (asmstr[i] == '%') // grr
+    {
+      asmstr.replace(i, 1, "%%");
+      ++i;
     }
-    else if (asmstr[i] == '%')//grr
-    { asmstr.replace(i, 1, "%%"); ++i;}
 
   return asmstr;
 }
 
-//TODO: assumptions about what consume arguments from the call are likely wrong
+// TODO: assumptions about what consume arguments from the call are likely wrong
 //      handle communitivity
 void CWriter::visitInlineAsm(CallInst &CI) {
-  InlineAsm* as = cast<InlineAsm>(CI.getCalledValue());
+  InlineAsm *as = cast<InlineAsm>(CI.getCalledValue());
   InlineAsm::ConstraintInfoVector Constraints = as->ParseConstraints();
 
-  std::vector<std::pair<Value*, int> > ResultVals;
+  std::vector<std::pair<Value *, int>> ResultVals;
   if (CI.getType() == Type::getVoidTy(CI.getContext()))
     ;
   else if (StructType *ST = dyn_cast<StructType>(CI.getType())) {
@@ -4967,16 +5411,18 @@ void CWriter::visitInlineAsm(CallInst &CI) {
 
   // Convert over all the output constraints.
   for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-      E = Constraints.end(); I != E; ++I) {
+                                                 E = Constraints.end();
+       I != E; ++I) {
 
     if (I->Type != InlineAsm::isOutput) {
       ++ValueCount;
-      continue;  // Ignore non-output constraints.
+      continue; // Ignore non-output constraints.
     }
 
     assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
     std::string C = InterpretASMConstraint(*I);
-    if (C.empty()) continue;
+    if (C.empty())
+      continue;
 
     if (!IsFirst) {
       Out << ", ";
@@ -4991,10 +5437,10 @@ void CWriter::visitInlineAsm(CallInst &CI) {
       DestVal = ResultVals[ValueCount].first;
       DestValNo = ResultVals[ValueCount].second;
     } else
-      DestVal = CI.getArgOperand(ValueCount-ResultVals.size());
+      DestVal = CI.getArgOperand(ValueCount - ResultVals.size());
 
     if (I->isEarlyClobber)
-      C = "&"+C;
+      C = "&" + C;
 
     Out << "\"=" << C << "\"(" << GetValueName(DestVal);
     if (DestValNo != -1)
@@ -5003,21 +5449,22 @@ void CWriter::visitInlineAsm(CallInst &CI) {
     ++ValueCount;
   }
 
-
   // Convert over all the input constraints.
   Out << "\n        :";
   IsFirst = true;
   ValueCount = 0;
   for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-      E = Constraints.end(); I != E; ++I) {
+                                                 E = Constraints.end();
+       I != E; ++I) {
     if (I->Type != InlineAsm::isInput) {
       ++ValueCount;
-      continue;  // Ignore non-input constraints.
+      continue; // Ignore non-input constraints.
     }
 
     assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
     std::string C = InterpretASMConstraint(*I);
-    if (C.empty()) continue;
+    if (C.empty())
+      continue;
 
     if (!IsFirst) {
       Out << ", ";
@@ -5025,7 +5472,7 @@ void CWriter::visitInlineAsm(CallInst &CI) {
     }
 
     assert(ValueCount >= ResultVals.size() && "Input can't refer to result");
-    Value *SrcVal = CI.getArgOperand(ValueCount-ResultVals.size());
+    Value *SrcVal = CI.getArgOperand(ValueCount - ResultVals.size());
 
     Out << "\"" << C << "\"(";
     if (!I->isIndirect)
@@ -5038,13 +5485,15 @@ void CWriter::visitInlineAsm(CallInst &CI) {
   // Convert over the clobber constraints.
   IsFirst = true;
   for (InlineAsm::ConstraintInfoVector::iterator I = Constraints.begin(),
-      E = Constraints.end(); I != E; ++I) {
+                                                 E = Constraints.end();
+       I != E; ++I) {
     if (I->Type != InlineAsm::isClobber)
-      continue;  // Ignore non-input constraints.
+      continue; // Ignore non-clobber constraints.
 
     assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
     std::string C = InterpretASMConstraint(*I);
-    if (C.empty()) continue;
+    if (C.empty())
+      continue;
 
     if (!IsFirst) {
       Out << ", ";
@@ -5062,21 +5511,22 @@ void CWriter::visitAllocaInst(AllocaInst &I) {
   printTypeName(Out, I.getType());
   Out << ") alloca(sizeof(";
   printTypeName(Out, I.getType()->getElementType());
-  if (I.isArrayAllocation()) { 
-    Out << ") * (" ;
+  if (I.isArrayAllocation()) {
+    Out << ") * (";
     writeOperand(I.getArraySize(), ContextCasted);
   }
   Out << "))";
 }
 
 void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
-    gep_type_iterator E, bool isArrayType, GetElementPtrInst *GEPI) {
-  //DEBUG(errs() << "Printing GEP\n");
-  //DEBUG(errs() << "\tPtr: " << *Ptr << "\n");
-  //DEBUG(errs() << "\tGEPI: " << *GEPI <<"\n"); 
+                                 gep_type_iterator E, bool isArrayType,
+                                 GetElementPtrInst *GEPI) {
+  // DEBUG(errs() << "Printing GEP\n");
+  // DEBUG(errs() << "\tPtr: " << *Ptr << "\n");
+  // DEBUG(errs() << "\tGEPI: " << *GEPI <<"\n");
   // If there are no indices, just print out the pointer.
   if (I == E) {
-    //DEBUG(errs() << "I==E: Calling writeOperand()\n");
+    // DEBUG(errs() << "I==E: Calling writeOperand()\n");
     writeOperand(Ptr);
     return;
   }
@@ -5087,7 +5537,7 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
   VectorType *LastIndexIsVector = 0;
   {
     for (gep_type_iterator TmpI = I; TmpI != E; ++TmpI)
-      //LastIndexIsVector = dyn_cast<VectorType>(TmpI.getCurTy());
+      // LastIndexIsVector = dyn_cast<VectorType>(TmpI.getCurTy());
       // CHECK: This change needs thorough testing
       LastIndexIsVector = dyn_cast<VectorType>(TmpI.getIndexedType());
   }
@@ -5096,53 +5546,55 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
   // If the last index is into a vector, we can't print it as &a[i][j] because
   // we can't index into a vector with j in GCC.  Instead, emit this as
   // (((float*)&a[i])+j)
-  // TODO: this is no longer true now that we don't represent vectors using gcc-extentions
+  // TODO: this is no longer true now that we don't represent vectors using
+  // gcc-extensions
   if (LastIndexIsVector) {
-    //DEBUG(errs() << "LastIndexIsVector\n");
+    // DEBUG(errs() << "LastIndexIsVector\n");
     Out << "((";
-    printTypeName(Out, PointerType::getUnqual(LastIndexIsVector->getElementType()));
+    printTypeName(Out,
+                  PointerType::getUnqual(LastIndexIsVector->getElementType()));
     Out << ")(";
   }
-  bool isArrayAccess = false; 
+  bool isArrayAccess = false;
 
   if (GEPStack.size() > 0 && GEPStack.top() == GEPI) {
-    //DEBUG(errs() << "Processing load-specific GEP\n");
+    // DEBUG(errs() << "Processing load-specific GEP\n");
     GEPStack.pop();
     isArrayAccess = true;
   } else {
-    //DEBUG(errs() << "I'm hereee!\n");
+    // DEBUG(errs() << "I'm hereee!\n");
     Out << '&';
   }
-  //DEBUG(errs() << "Here!\n");
+  // DEBUG(errs() << "Here!\n");
   // If the first index is 0 (very typical) we can do a number of
   // simplifications to clean up the code.
   Value *FirstOp = I.getOperand();
-  //DEBUG(errs() << "FirstOp: " << *(I.getOperand()) << "\n");
+  // DEBUG(errs() << "FirstOp: " << *(I.getOperand()) << "\n");
   if (!isa<Constant>(FirstOp) || !cast<Constant>(FirstOp)->isNullValue()) {
-    //DEBUG(errs() << "Calling writeoperand()\n");
+    // DEBUG(errs() << "Calling writeoperand()\n");
     // First index isn't simple, print it the hard way.
     writeOperand(Ptr, ContextNormal, isArrayAccess);
   } else {
-    ++I;  // Skip the zero index.
-    //DEBUG(errs() << "Skipping zero index\n");
+    ++I; // Skip the zero index.
+    // DEBUG(errs() << "Skipping zero index\n");
 
     // Okay, emit the first operand. If Ptr is something that is already address
     // exposed, like a global, avoid emitting (&foo)[0], just emit foo instead.
     if (isAddressExposed(Ptr)) {
-      //DEBUG(errs() << "Address exposed; calling writeoperandinternal()\n");
+      // DEBUG(errs() << "Address exposed; calling writeoperandinternal()\n");
       writeOperandInternal(Ptr);
     }
-    //else if (I != E && (I.getCurTy())->isStructTy()) {
+    // else if (I != E && (I.getCurTy())->isStructTy()) {
     // NOTE: This change needs to be tested more
-    else if (I != E && (I.isStruct()) ) {
-      //DEBUG(errs() << "Not address exposed; is struct type\n");
+    else if (I != E && (I.isStruct())) {
+      // DEBUG(errs() << "Not address exposed; is struct type\n");
       // If we didn't already emit the first operand, see if we can print it as
       // P->f instead of "P[0].f"
       writeOperand(Ptr);
       Out << "->field" << cast<ConstantInt>(I.getOperand())->getZExtValue();
-      ++I;  // eat the struct index as well.
+      ++I; // eat the struct index as well.
     } else {
-      //DEBUG(errs() << "In else; emitting *P\n");
+      // DEBUG(errs() << "In else; emitting *P\n");
       // Instead of emitting P[0][1], emit (*P)[1], which is more idiomatic.
       Out << "(*";
       writeOperand(Ptr);
@@ -5153,28 +5605,32 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
   Type *Agg = GEPI->getSourceElementType();
   unsigned CurIdx = 1;
   for (; I != E; ++CurIdx, ++I) {
-    assert(I.getOperand()->getType()->isIntegerTy()); // TODO: indexing a Vector with a Vector is valid, but we don't support it here
-    //DEBUG(errs() << "Type: " << *Agg << "; operand: " << *(I.getOperand()) << "\n");
-    if ((Agg->isStructTy())){
-      //DEBUG(errs() << "Found a struct\n");
+    assert(I.getOperand()
+               ->getType()
+               ->isIntegerTy()); // TODO: indexing a Vector with a Vector is
+                                 // valid, but we don't support it here
+    // DEBUG(errs() << "Type: " << *Agg << "; operand: " << *(I.getOperand()) <<
+    // "\n");
+    if ((Agg->isStructTy())) {
+      // DEBUG(errs() << "Found a struct\n");
       Out << ".field" << cast<ConstantInt>(I.getOperand())->getZExtValue();
     } else if (Agg->isArrayTy()) {
-      //DEBUG(errs() << "Found an array!\n");
+      // DEBUG(errs() << "Found an array!\n");
       Out << ".array[";
       writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
       Out << ']';
     } else if (!Agg->isVectorTy()) {
-      //DEBUG(errs() << "Not a vector!\n");
+      // DEBUG(errs() << "Not a vector!\n");
       Out << '[';
       writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
       Out << ']';
     } else {
-      //DEBUG(errs() << "In else!\n");
+      // DEBUG(errs() << "In else!\n");
       // If the last index is into a vector, then print it out as "+j)".  This
       // works with the 'LastIndexIsVector' code above.
       if (isa<Constant>(I.getOperand()) &&
           cast<Constant>(I.getOperand())->isNullValue()) {
-        Out << "))";  // avoid "+0".
+        Out << "))"; // avoid "+0".
       } else {
         Out << ")+(";
         writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
@@ -5182,248 +5638,246 @@ void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
       }
     }
     CompositeType *CT = dyn_cast<CompositeType>(Agg);
-    if (!CT || CT->isPointerTy()) 
-    {
-      //DEBUG(errs() << "Something wrong!!\n");
+    if (!CT || CT->isPointerTy()) {
+      // DEBUG(errs() << "Something wrong!!\n");
       break;
     }
-    Value* Index = GEPI->getOperand(CurIdx);
+    Value *Index = GEPI->getOperand(CurIdx);
     if (!CT->indexValid(Index))
-      if (!CT || CT->isPointerTy()) 
-      {
-        //DEBUG(errs() << "Something wrong 2!!\n");
+      if (!CT || CT->isPointerTy()) {
+        // DEBUG(errs() << "Something wrong 2!!\n");
         break;
       }
     Agg = CT->getTypeAtIndex(Index);
   }
   Out << ")";
-  //DEBUG(errs() << "Leaving printGEPExpression\n");
-  }
+  // DEBUG(errs() << "Leaving printGEPExpression\n");
+}
 
-  void CWriter::writeMemoryAccess(Value *Operand, Type *OperandType,
-      bool IsVolatile, unsigned Alignment /*bytes*/) {
-    //DEBUG(errs() << *OperandType << "; " << *Operand << "\n");
-    bool arrayAccess = false;
-    if(isa<GetElementPtrInst>(Operand)) {
-      //DEBUG(errs() << "ISA Get Element Pointer!\n");
-      arrayAccess = true;
-      GEPStack.push(dyn_cast<GetElementPtrInst>(Operand));
-    }
-    //  if (isAddressExposed(Operand)) {
-    //    DEBUG(errs() << "Is address exposed!!\n");
-    //    writeOperandInternal(Operand);
-    //    return;
-    //  }
-
-    bool IsUnaligned = Alignment &&
-      Alignment < TD->getABITypeAlignment(OperandType);
-    if (!arrayAccess) {
-      if (!IsUnaligned)
-        Out << '*';
-
-      else if (IsUnaligned) {
-        Out << "__UNALIGNED_LOAD__(";
-        printTypeNameUnaligned(Out, OperandType, false);
-        if (IsVolatile) Out << " volatile";
-        Out << ", " << Alignment << ", ";
-      }
+void CWriter::writeMemoryAccess(Value *Operand, Type *OperandType,
+                                bool IsVolatile, unsigned Alignment /*bytes*/) {
+  // DEBUG(errs() << *OperandType << "; " << *Operand << "\n");
+  bool arrayAccess = false;
+  if (isa<GetElementPtrInst>(Operand)) {
+    // DEBUG(errs() << "ISA Get Element Pointer!\n");
+    arrayAccess = true;
+    GEPStack.push(dyn_cast<GetElementPtrInst>(Operand));
+  }
+  //  if (isAddressExposed(Operand)) {
+  //    DEBUG(errs() << "Is address exposed!!\n");
+  //    writeOperandInternal(Operand);
+  //    return;
+  //  }
 
-      else if (IsVolatile) {
-        Out << "(";
-        printTypeName(Out, OperandType, false);
-        Out << "volatile";
-        Out << "*)";
-      } 
+  bool IsUnaligned =
+      Alignment && Alignment < TD->getABITypeAlignment(OperandType);
+  if (!arrayAccess) {
+    if (!IsUnaligned)
+      Out << '*';
+
+    else if (IsUnaligned) {
+      Out << "__UNALIGNED_LOAD__(";
+      printTypeNameUnaligned(Out, OperandType, false);
+      if (IsVolatile)
+        Out << " volatile";
+      Out << ", " << Alignment << ", ";
     }
 
-    writeOperand(Operand,ContextNormal, arrayAccess );
-
-    if (IsUnaligned) {
-      Out << ")";
+    else if (IsVolatile) {
+      Out << "(";
+      printTypeName(Out, OperandType, false);
+      Out << "volatile";
+      Out << "*)";
     }
   }
 
-  void CWriter::visitLoadInst(LoadInst &I) {
-    //DEBUG(errs() << "Visiting Load instruction!\n");
-    //  DEBUG(errs() << "Visiting load: " << I << "\n");
-    writeMemoryAccess(I.getOperand(0), I.getType(), I.isVolatile(),
-        I.getAlignment());
+  writeOperand(Operand, ContextNormal, arrayAccess);
 
+  if (IsUnaligned) {
+    Out << ")";
   }
+}
 
-  void CWriter::visitStoreInst(StoreInst &I) {
-    //DEBUG(errs() << "Visiting store instruction!\n");
-    writeMemoryAccess(I.getPointerOperand(), I.getOperand(0)->getType(),
-        I.isVolatile(), I.getAlignment());
-    Out << " = ";
-    Value *Operand = I.getOperand(0);
-    unsigned BitMask = 0;
-    if (IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType()))
-      if (!ITy->isPowerOf2ByteWidth())
-        // We have a bit width that doesn't match an even power-of-2 byte
-        // size. Consequently we must & the value with the type's bit mask
-        BitMask = ITy->getBitMask();
-    if (BitMask)
-      Out << "((";
-    writeOperand(Operand, BitMask ? ContextNormal : ContextCasted);
-    if (BitMask)
-      Out << ") & " << BitMask << ")";
-  }
+void CWriter::visitLoadInst(LoadInst &I) {
+  // DEBUG(errs() << "Visiting Load instruction!\n");
+  //  DEBUG(errs() << "Visiting load: " << I << "\n");
+  writeMemoryAccess(I.getOperand(0), I.getType(), I.isVolatile(),
+                    I.getAlignment());
+}
 
-  void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) {
-    //  DEBUG(errs() <<"Visiting GEP: " << I << "\n");
-    printGEPExpression(I.getPointerOperand(), gep_type_begin(I),
-        gep_type_end(I), I.getSourceElementType()->isArrayTy(), &I);
-  }
+void CWriter::visitStoreInst(StoreInst &I) {
+  // DEBUG(errs() << "Visiting store instruction!\n");
+  writeMemoryAccess(I.getPointerOperand(), I.getOperand(0)->getType(),
+                    I.isVolatile(), I.getAlignment());
+  Out << " = ";
+  Value *Operand = I.getOperand(0);
+  unsigned BitMask = 0;
+  if (IntegerType *ITy = dyn_cast<IntegerType>(Operand->getType()))
+    if (!ITy->isPowerOf2ByteWidth())
+      // We have a bit width that doesn't match an even power-of-2 byte
+      // size. Consequently we must & the value with the type's bit mask
+      BitMask = ITy->getBitMask();
+  if (BitMask)
+    Out << "((";
+  writeOperand(Operand, BitMask ? ContextNormal : ContextCasted);
+  if (BitMask)
+    Out << ") & " << BitMask << ")";
+}
 
-  void CWriter::visitVAArgInst(VAArgInst &I) {
-    Out << "va_arg(*(va_list*)";
-    writeOperand(I.getOperand(0), ContextCasted);
-    Out << ", ";
-    printTypeName(Out, I.getType());
-    Out << ");\n ";
-  }
+void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) {
+  //  DEBUG(errs() <<"Visiting GEP: " << I << "\n");
+  printGEPExpression(I.getPointerOperand(), gep_type_begin(I), gep_type_end(I),
+                     I.getSourceElementType()->isArrayTy(), &I);
+}
 
-  void CWriter::visitInsertElementInst(InsertElementInst &I) {
-    // Start by copying the entire aggregate value into the result variable.
-    writeOperand(I.getOperand(0));
-    Type *EltTy = I.getType()->getElementType();
-    assert(I.getOperand(1)->getType() == EltTy);
-    if (isEmptyType(EltTy)) return;
-
-    // Then do the insert to update the field.
-    Out << ";\n  ";
-    Out << GetValueName(&I) << ".vector[";
-    writeOperand(I.getOperand(2));
-    Out << "] = ";
-    writeOperand(I.getOperand(1), ContextCasted);
-  }
+void CWriter::visitVAArgInst(VAArgInst &I) {
+  Out << "va_arg(*(va_list*)";
+  writeOperand(I.getOperand(0), ContextCasted);
+  Out << ", ";
+  printTypeName(Out, I.getType());
+  Out << ");\n ";
+}
 
-  void CWriter::visitExtractElementInst(ExtractElementInst &I) {
-    assert(!isEmptyType(I.getType()));
-    if (isa<UndefValue>(I.getOperand(0))) {
-      Out << "(";
-      printTypeName(Out, I.getType());
-      Out << ") 0/*UNDEF*/";
-    } else {
-      Out << "(";
-      writeOperand(I.getOperand(0));
-      Out << ").vector[";
-      writeOperand(I.getOperand(1));
-      Out << "]";
-    }
-  }
+void CWriter::visitInsertElementInst(InsertElementInst &I) {
+  // Start by copying the entire aggregate value into the result variable.
+  writeOperand(I.getOperand(0));
+  Type *EltTy = I.getType()->getElementType();
+  assert(I.getOperand(1)->getType() == EltTy);
+  if (isEmptyType(EltTy))
+    return;
 
-  // <result> = shufflevector <n x <ty>> <v1>, <n x <ty>> <v2>, <m x i32> <mask>
-  // ; yields <m x <ty>>
-  void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
-    VectorType *VT = SVI.getType();
-    Type *EltTy = VT->getElementType();
-    VectorType *InputVT = cast<VectorType>(SVI.getOperand(0)->getType());
-    assert(!isEmptyType(VT));
-    assert(InputVT->getElementType() == VT->getElementType());
+  // Then do the insert to update the field.
+  Out << ";\n  ";
+  Out << GetValueName(&I) << ".vector[";
+  writeOperand(I.getOperand(2));
+  Out << "] = ";
+  writeOperand(I.getOperand(1), ContextCasted);
+}
 
-    CtorDeclTypes.insert(VT);
-    Out << "llvm_ctor_";
-    printTypeString(Out, VT, false);
+void CWriter::visitExtractElementInst(ExtractElementInst &I) {
+  assert(!isEmptyType(I.getType()));
+  if (isa<UndefValue>(I.getOperand(0))) {
     Out << "(";
+    printTypeName(Out, I.getType());
+    Out << ") 0/*UNDEF*/";
+  } else {
+    Out << "(";
+    writeOperand(I.getOperand(0));
+    Out << ").vector[";
+    writeOperand(I.getOperand(1));
+    Out << "]";
+  }
+}
+
+// <result> = shufflevector <n x <ty>> <v1>, <n x <ty>> <v2>, <m x i32> <mask>
+// ; yields <m x <ty>>
+void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+  VectorType *VT = SVI.getType();
+  Type *EltTy = VT->getElementType();
+  VectorType *InputVT = cast<VectorType>(SVI.getOperand(0)->getType());
+  assert(!isEmptyType(VT));
+  assert(InputVT->getElementType() == VT->getElementType());
 
-    Constant *Zero = Constant::getNullValue(EltTy);
-    unsigned NumElts = VT->getNumElements();
-    unsigned NumInputElts = InputVT->getNumElements(); // n
-    for (unsigned i = 0; i != NumElts; ++i) {
-      if (i) Out << ", ";
-      int SrcVal = SVI.getMaskValue(i);
-      if ((unsigned)SrcVal >= NumInputElts * 2) {
-        Out << "/*undef*/";
+  CtorDeclTypes.insert(VT);
+  Out << "llvm_ctor_";
+  printTypeString(Out, VT, false);
+  Out << "(";
+
+  Constant *Zero = Constant::getNullValue(EltTy);
+  unsigned NumElts = VT->getNumElements();
+  unsigned NumInputElts = InputVT->getNumElements(); // n
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (i)
+      Out << ", ";
+    int SrcVal = SVI.getMaskValue(i);
+    if ((unsigned)SrcVal >= NumInputElts * 2) {
+      Out << "/*undef*/";
+      printConstant(Zero, ContextCasted);
+    } else {
+      // If SrcVal belongs [0, n - 1], it extracts value from <v1>
+      // If SrcVal belongs [n, 2 * n - 1], it extracts value from <v2>
+      // In C++, the value false is converted to zero and the value true is
+      // converted to one
+      Value *Op = SVI.getOperand((unsigned)SrcVal >= NumInputElts);
+      if (isa<Instruction>(Op)) {
+        // Do an extractelement of this value from the appropriate input.
+        Out << "(";
+        writeOperand(Op);
+        Out << ").vector[";
+        Out << ((unsigned)SrcVal >= NumInputElts ? SrcVal - NumInputElts
+                                                 : SrcVal);
+        Out << "]";
+      } else if (isa<ConstantAggregateZero>(Op) || isa<UndefValue>(Op)) {
         printConstant(Zero, ContextCasted);
       } else {
-        // If SrcVal belongs [0, n - 1], it extracts value from <v1>
-        // If SrcVal belongs [n, 2 * n - 1], it extracts value from <v2>
-        // In C++, the value false is converted to zero and the value true is
-        // converted to one
-        Value *Op = SVI.getOperand((unsigned)SrcVal >= NumInputElts);
-        if (isa<Instruction>(Op)) {
-          // Do an extractelement of this value from the appropriate input.
-          Out << "(";
-          writeOperand(Op);
-          Out << ").vector[";
-          Out << ((unsigned)SrcVal >= NumInputElts ? SrcVal - NumInputElts : SrcVal);
-          Out << "]";
-        } else if (isa<ConstantAggregateZero>(Op) || isa<UndefValue>(Op)) {
-          printConstant(Zero, ContextCasted);
-        } else {
-          printConstant(cast<ConstantVector>(Op)->getOperand(SrcVal &
-                (NumElts-1)),
-              ContextNormal);
-        }
+        printConstant(
+            cast<ConstantVector>(Op)->getOperand(SrcVal & (NumElts - 1)),
+            ContextNormal);
       }
     }
-    Out << ")";
   }
+  Out << ")";
+}
+
+void CWriter::visitInsertValueInst(InsertValueInst &IVI) {
+  // Start by copying the entire aggregate value into the result variable.
+  writeOperand(IVI.getOperand(0));
+  Type *EltTy = IVI.getOperand(1)->getType();
+  if (isEmptyType(EltTy))
+    return;
+
+  // Then do the insert to update the field.
+  Out << ";\n  ";
+  Out << GetValueName(&IVI);
+  for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end(); i != e;
+       ++i) {
+    Type *IndexedTy = ExtractValueInst::getIndexedType(
+        IVI.getOperand(0)->getType(), makeArrayRef(b, i));
+    assert(IndexedTy);
+    if (IndexedTy->isArrayTy())
+      Out << ".array[" << *i << "]";
+    else
+      Out << ".field" << *i;
+  }
+  Out << " = ";
+  writeOperand(IVI.getOperand(1), ContextCasted);
+}
 
-  void CWriter::visitInsertValueInst(InsertValueInst &IVI) {
-    // Start by copying the entire aggregate value into the result variable.
-    writeOperand(IVI.getOperand(0));
-    Type *EltTy = IVI.getOperand(1)->getType();
-    if (isEmptyType(EltTy)) return;
-
-    // Then do the insert to update the field.
-    Out << ";\n  ";
-    Out << GetValueName(&IVI);
-    for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end();
-        i != e; ++i) {
-      Type *IndexedTy =
-        ExtractValueInst::getIndexedType(IVI.getOperand(0)->getType(),
-            makeArrayRef(b, i));
-      assert(IndexedTy);
+void CWriter::visitExtractValueInst(ExtractValueInst &EVI) {
+  Out << "(";
+  if (isa<UndefValue>(EVI.getOperand(0))) {
+    Out << "(";
+    printTypeName(Out, EVI.getType());
+    Out << ") 0/*UNDEF*/";
+  } else {
+    writeOperand(EVI.getOperand(0));
+    for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end();
+         i != e; ++i) {
+      Type *IndexedTy = ExtractValueInst::getIndexedType(
+          EVI.getOperand(0)->getType(), makeArrayRef(b, i));
       if (IndexedTy->isArrayTy())
         Out << ".array[" << *i << "]";
       else
         Out << ".field" << *i;
     }
-    Out << " = ";
-    writeOperand(IVI.getOperand(1), ContextCasted);
-  }
-
-  void CWriter::visitExtractValueInst(ExtractValueInst &EVI) {
-    Out << "(";
-    if (isa<UndefValue>(EVI.getOperand(0))) {
-      Out << "(";
-      printTypeName(Out, EVI.getType());
-      Out << ") 0/*UNDEF*/";
-    } else {
-      writeOperand(EVI.getOperand(0));
-      for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end();
-          i != e; ++i) {
-        Type *IndexedTy =
-          ExtractValueInst::getIndexedType(EVI.getOperand(0)->getType(),
-              makeArrayRef(b, i));
-        if (IndexedTy->isArrayTy())
-          Out << ".array[" << *i << "]";
-        else
-          Out << ".field" << *i;
-      }
-    }
-    Out << ")";
   }
+  Out << ")";
+}
 
-  //===----------------------------------------------------------------------===//
-  //                       External Interface declaration
-  //===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+//                       External Interface declaration
+//===----------------------------------------------------------------------===//
 
-  bool CTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
-      raw_pwrite_stream &Out,
-      raw_pwrite_stream *Out2,
-      CodeGenFileType FileType,
-      bool DisableVerify,
-      MachineModuleInfo *MMI){
+bool CTargetMachine::addPassesToEmitFile(
+    PassManagerBase &PM, raw_pwrite_stream &Out, raw_pwrite_stream *Out2,
+    CodeGenFileType FileType, bool DisableVerify, MachineModuleInfo *MMI) {
 
-    if (FileType != TargetMachine::CGFT_AssemblyFile) return true;
+  if (FileType != TargetMachine::CGFT_AssemblyFile)
+    return true;
 
-    PM.add(createGCLoweringPass());
-    PM.add(createLowerInvokePass());
-    PM.add(createCFGSimplificationPass());   // clean up after lower invoke.
-    PM.add(new CWriter(Out));
-    return false;
-  }
+  PM.add(createGCLoweringPass());
+  PM.add(createLowerInvokePass());
+  PM.add(createCFGSimplificationPass()); // clean up after lower invoke.
+  PM.add(new CWriter(Out));
+  return false;
+}
diff --git a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.h b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.h
index 07f3b80af5d7fc4eda35df068e59eb6b7e79202d..33d936d9d09026e961d8bf723263c32baa0bd390 100644
--- a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.h
+++ b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CBackend.h
@@ -1,25 +1,34 @@
 
 #include "CTargetMachine.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/IVUsers.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/CallSite.h"
 #include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@@ -29,21 +38,12 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Pass.h"
-#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include "llvm/Transforms/Utils/Mem2Reg.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/IVUsers.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/Mem2Reg.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
 
 #include <set>
 #include <stack>
@@ -55,290 +55,304 @@
 #define PRIVATE_ADDRSPACE 5
 
 namespace {
-  using namespace llvm;
-
-  class CBEMCAsmInfo : public MCAsmInfo {
-  public:
-    CBEMCAsmInfo() {
-      PrivateGlobalPrefix = "";
-    }
-  };
-
-  /// CWriter - This class is the main chunk of code that converts an LLVM
-  /// module to a C translation unit.
-  class CWriter : public FunctionPass, public InstVisitor<CWriter> {
-    std::string _Out;
-    raw_string_ostream Out;
-    raw_pwrite_stream &FileOut;
-    IntrinsicLowering *IL;
-    LoopInfo *LI;
-    PostDominatorTree *PDT;
-    DominatorTree *DT;
-    ScalarEvolution *SE;
-    IVUsers *IU;
-    AssumptionCache *AC;
-    
-    const Module *TheModule;
-    const MCAsmInfo* TAsm;
-    const MCRegisterInfo *MRI;
-    const MCObjectFileInfo *MOFI;
-    MCContext *TCtx;
-    const DataLayout* TD;
-
-    std::map<const ConstantFP *, unsigned> FPConstantMap;
-    std::set<const Argument*> ByValParams;
-    
-    // Set for storing all loop induction variables 
-    std::set<PHINode*> LInductionVars;
-    std::map<Loop*, PHINode*> LoopIndVarsMap;
-
-    unsigned FPCounter;
-    unsigned OpaqueCounter;
-
-    DenseMap<const Value*, unsigned> AnonValueNumbers;
-    unsigned NextAnonValueNumber;
-
-    /// UnnamedStructIDs - This contains a unique ID for each struct that is
-    /// either anonymous or has no name.
-    DenseMap<StructType*, unsigned> UnnamedStructIDs;
-    unsigned NextAnonStructNumber;
-
-    std::set<Type*> TypedefDeclTypes;
-    std::set<Type*> SelectDeclTypes;
-    std::set<std::pair<CmpInst::Predicate, VectorType*>> CmpDeclTypes;
-    std::set<std::pair<CastInst::CastOps, std::pair<Type*, Type*>>> CastOpDeclTypes;
-    std::set<std::pair<unsigned, Type*>> InlineOpDeclTypes;
-    std::set<Type*> CtorDeclTypes;
-
-    DenseMap<std::pair<FunctionType*, std::pair<AttributeList, CallingConv::ID>>, unsigned> UnnamedFunctionIDs;
-    unsigned NextFunctionNumber;
-
-    // This is used to keep track of intrinsics that get generated to a lowered
-    // function. We must generate the prototypes before the function body which
-    // will only be expanded on first use
-    std::vector<Function*> prototypesToGen;
-    
-    // Set for keeping track of visited blocks to avoid goto when possible
-    std::set<BasicBlock*> VisitedBlocks;
-    std::set<BasicBlock*> CompVisitedBlocks;
-    std::set<BasicBlock*> FindVisitedBlocks;
-    std::set<BasicBlock*> ReplicateBlocks;
-    std::stack<BasicBlock*> ImmPostDommBlocks;
-    std::stack<BasicBlock*> ElseBlocks;
-    std::stack<BranchInst*> ElseBranches;
-    std::stack<GetElementPtrInst*> GEPStack;
-  public:
-    static char ID;
-    explicit CWriter(raw_pwrite_stream &o)
-      : FunctionPass(ID), Out(_Out), FileOut(o), IL(0), LI(0),
-        TheModule(0), TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0),
-        OpaqueCounter(0), NextAnonValueNumber(0),
-        NextAnonStructNumber(0), NextFunctionNumber(0), PDT(0) {
-      FPCounter = 0;
-    }
-
-    virtual StringRef getPassName() const { return "C backend"; }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addRequired<LoopInfoWrapperPass>();
-      // Adding PDT pass to avoid code duplication
-      AU.addRequired<PostDominatorTreeWrapperPass>();
-      AU.addRequired<ScalarEvolutionWrapperPass>();
-      AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<AssumptionCacheTracker>();
-//      AU.addRequiredID(LoopSimplifyID);
-//      AU.addRequired<LoopSimplifyPass>();
-
-//      AU.addRequired<IVUsersWrapperPass>();
-      //AU.addRequired<PromotePass>();
-      AU.setPreservesCFG();
-    }
-
-    virtual bool doInitialization(Module &M);
-    virtual bool doFinalization(Module &M);
-    virtual bool runOnFunction(Function &F);
-
-  private:
-
-    void generateHeader(Module &M);
-    void declareOneGlobalVariable(GlobalVariable* I);
-
-    void forwardDeclareStructs(raw_ostream &Out, Type *Ty, std::set<Type*> &TypesPrinted);
-    void forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty,
-					std::set<Type*> &TypesPrinted);
-
-    raw_ostream &printFunctionProto(raw_ostream &Out, FunctionType *Ty,
-				    //std::pair<AttributeSet, CallingConv::ID> Attrs,
-				    std::pair<AttributeList, CallingConv::ID> Attrs,
-				    const std::string &Name,
-				    Function::arg_iterator ArgList,
-				    //Function::ArgumentListType *ArgList,
-				    bool isKernel);
-    
-    raw_ostream &printFunctionProto(raw_ostream &Out, Function *F) {
-      bool isKernel = false;
-      if (NamedMDNode * KernelMD = F->getParent()->getNamedMetadata("opencl.kernels")) {
-        for (auto iter : KernelMD->operands()) {
-          const MDOperand *KernelMDOp = iter->operands().begin();
-          Metadata *KMD = KernelMDOp->get();
-          if(ValueAsMetadata *KMDVAM = dyn_cast<ValueAsMetadata>(KMD)){
-            Value *KMDVal = KMDVAM->getValue();
-            Function *KMDFunc = dyn_cast<Function>(KMDVal);
-            if(KMDFunc == F) {
-              isKernel = true;
-            }
+using namespace llvm;
+
+class CBEMCAsmInfo : public MCAsmInfo {
+public:
+  CBEMCAsmInfo() { PrivateGlobalPrefix = ""; }
+};
+
+/// CWriter - This class is the main chunk of code that converts an LLVM
+/// module to a C translation unit.
+class CWriter : public FunctionPass, public InstVisitor<CWriter> {
+  std::string _Out;
+  raw_string_ostream Out;
+  raw_pwrite_stream &FileOut;
+  IntrinsicLowering *IL;
+  LoopInfo *LI;
+  PostDominatorTree *PDT;
+  DominatorTree *DT;
+  ScalarEvolution *SE;
+  IVUsers *IU;
+  AssumptionCache *AC;
+
+  const Module *TheModule;
+  const MCAsmInfo *TAsm;
+  const MCRegisterInfo *MRI;
+  const MCObjectFileInfo *MOFI;
+  MCContext *TCtx;
+  const DataLayout *TD;
+
+  std::map<const ConstantFP *, unsigned> FPConstantMap;
+  std::set<const Argument *> ByValParams;
+
+  // Set for storing all loop induction variables
+  std::set<PHINode *> LInductionVars;
+  std::map<Loop *, PHINode *> LoopIndVarsMap;
+
+  unsigned FPCounter;
+  unsigned OpaqueCounter;
+
+  DenseMap<const Value *, unsigned> AnonValueNumbers;
+  unsigned NextAnonValueNumber;
+
+  /// UnnamedStructIDs - This contains a unique ID for each struct that is
+  /// either anonymous or has no name.
+  DenseMap<StructType *, unsigned> UnnamedStructIDs;
+  unsigned NextAnonStructNumber;
+
+  std::set<Type *> TypedefDeclTypes;
+  std::set<Type *> SelectDeclTypes;
+  std::set<std::pair<CmpInst::Predicate, VectorType *>> CmpDeclTypes;
+  std::set<std::pair<CastInst::CastOps, std::pair<Type *, Type *>>>
+      CastOpDeclTypes;
+  std::set<std::pair<unsigned, Type *>> InlineOpDeclTypes;
+  std::set<Type *> CtorDeclTypes;
+
+  DenseMap<std::pair<FunctionType *, std::pair<AttributeList, CallingConv::ID>>,
+           unsigned>
+      UnnamedFunctionIDs;
+  unsigned NextFunctionNumber;
+
+  // This is used to keep track of intrinsics that get generated to a lowered
+  // function. We must generate the prototypes before the function body which
+  // will only be expanded on first use
+  std::vector<Function *> prototypesToGen;
+
+  // Set for keeping track of visited blocks to avoid goto when possible
+  std::set<BasicBlock *> VisitedBlocks;
+  std::set<BasicBlock *> CompVisitedBlocks;
+  std::set<BasicBlock *> FindVisitedBlocks;
+  std::set<BasicBlock *> ReplicateBlocks;
+  std::stack<BasicBlock *> ImmPostDommBlocks;
+  std::stack<BasicBlock *> ElseBlocks;
+  std::stack<BranchInst *> ElseBranches;
+  std::stack<GetElementPtrInst *> GEPStack;
+
+public:
+  static char ID;
+  explicit CWriter(raw_pwrite_stream &o)
+      : FunctionPass(ID), Out(_Out), FileOut(o), IL(0), LI(0), TheModule(0),
+        TAsm(0), MRI(0), MOFI(0), TCtx(0), TD(0), OpaqueCounter(0),
+        NextAnonValueNumber(0), NextAnonStructNumber(0), NextFunctionNumber(0),
+        PDT(0) {
+    FPCounter = 0;
+  }
+
+  virtual StringRef getPassName() const { return "C backend"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<LoopInfoWrapperPass>();
+    // Adding PDT pass to avoid code duplication
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+    //      AU.addRequiredID(LoopSimplifyID);
+    //      AU.addRequired<LoopSimplifyPass>();
+
+    //      AU.addRequired<IVUsersWrapperPass>();
+    // AU.addRequired<PromotePass>();
+    AU.setPreservesCFG();
+  }
+
+  virtual bool doInitialization(Module &M);
+  virtual bool doFinalization(Module &M);
+  virtual bool runOnFunction(Function &F);
+
+private:
+  void generateHeader(Module &M);
+  void declareOneGlobalVariable(GlobalVariable *I);
+
+  void forwardDeclareStructs(raw_ostream &Out, Type *Ty,
+                             std::set<Type *> &TypesPrinted);
+  void forwardDeclareFunctionTypedefs(raw_ostream &Out, Type *Ty,
+                                      std::set<Type *> &TypesPrinted);
+
+  raw_ostream &
+  printFunctionProto(raw_ostream &Out, FunctionType *Ty,
+                     // std::pair<AttributeSet, CallingConv::ID> Attrs,
+                     std::pair<AttributeList, CallingConv::ID> Attrs,
+                     const std::string &Name, Function::arg_iterator ArgList,
+                     // Function::ArgumentListType *ArgList,
+                     bool isKernel);
+
+  raw_ostream &printFunctionProto(raw_ostream &Out, Function *F) {
+    bool isKernel = false;
+    if (NamedMDNode *KernelMD =
+            F->getParent()->getNamedMetadata("opencl.kernels")) {
+      for (auto iter : KernelMD->operands()) {
+        const MDOperand *KernelMDOp = iter->operands().begin();
+        Metadata *KMD = KernelMDOp->get();
+        if (ValueAsMetadata *KMDVAM = dyn_cast<ValueAsMetadata>(KMD)) {
+          Value *KMDVal = KMDVAM->getValue();
+          Function *KMDFunc = dyn_cast<Function>(KMDVal);
+          if (KMDFunc == F) {
+            isKernel = true;
           }
         }
       }
-      
-      return printFunctionProto(Out, F->getFunctionType(), std::make_pair(F->getAttributes(), F->getCallingConv()), GetValueName(F), NULL, isKernel);
-      
     }
 
-    raw_ostream &printFunctionDeclaration(raw_ostream &Out, FunctionType *Ty,
-        std::pair<AttributeList, CallingConv::ID> PAL = std::make_pair(AttributeList(), CallingConv::C));
-    raw_ostream &printStructDeclaration(raw_ostream &Out, StructType *Ty);
-    raw_ostream &printArrayDeclaration(raw_ostream &Out, ArrayType *Ty);
-    raw_ostream &printVectorDeclaration(raw_ostream &Out, VectorType *Ty);
-
-    raw_ostream &printTypeName(raw_ostream &Out, Type *Ty,
-			       bool isSigned = false,
-			       std::pair<AttributeList, CallingConv::ID>
-			       PAL = std::make_pair(AttributeList(), CallingConv::C));
-    raw_ostream &printTypeNameUnaligned(raw_ostream &Out, Type *Ty, bool isSigned = false);
-    raw_ostream &printSimpleType(raw_ostream &Out, Type *Ty, bool isSigned);
-    raw_ostream &printTypeString(raw_ostream &Out, Type *Ty, bool isSigned);
-
-    std::string getStructName(StructType *ST);
-    std::string getFunctionName(FunctionType *FT,
-				std::pair<AttributeList, CallingConv::ID> PAL
-				= std::make_pair(AttributeList(), CallingConv::C));
-    std::string getArrayName(ArrayType *AT);
-    std::string getVectorName(VectorType *VT, bool Aligned);
-
-    enum OperandContext {
-      ContextNormal,
-      ContextCasted,
-      // Casted context means the type-cast will be implicit,
-      // such as the RHS of a `var = RHS;` expression
-      // or inside a struct initializer expression
-      ContextStatic
-        // Static context means that it is being used in as a static initializer
-        // (also implies ContextCasted)
-    };
-
-    void writeOperandDeref(Value *Operand);
-    void writeOperand(Value *Operand, enum OperandContext Context = ContextNormal, bool arrayAccess = false);
-    void writeInstComputationInline(Instruction &I);
-    void writeOperandInternal(Value *Operand, enum OperandContext Context = ContextNormal);
-    void writeOperandWithCast(Value* Operand, unsigned Opcode);
-    void opcodeNeedsCast(unsigned Opcode, bool &shouldCast, bool &castIsSigned);
-
-    void writeOperandWithCast(Value* Operand, ICmpInst &I);
-    bool writeInstructionCast(Instruction &I);
-    void writeMemoryAccess(Value *Operand, Type *OperandType,
-        bool IsVolatile, unsigned Alignment);
-
-    std::string InterpretASMConstraint(InlineAsm::ConstraintInfo& c);
-
-    void lowerIntrinsics(Function &F);
-    /// Prints the definition of the intrinsic function F. Supports the
-    /// intrinsics which need to be explicitly defined in the CBackend.
-    void printIntrinsicDefinition(Function &F, raw_ostream &Out);
-    void printIntrinsicDefinition(FunctionType *funT,
-        unsigned Opcode, std::string OpName,
-        raw_ostream &Out);
-
-    void printModuleTypes(raw_ostream &Out);
-    void printContainedTypes(raw_ostream &Out, Type *Ty, std::set<Type*> &);
-
-    void printFloatingPointConstants(Function &F);
-    void printFloatingPointConstants(const Constant *C);
-
-    void printFunction(Function &);
-    void printBasicBlock(BasicBlock *BB);
-    void printLoop(Loop *L);
-
-    void printCast(unsigned opcode, Type *SrcTy, Type *DstTy);
-    void printConstant(Constant *CPV, enum OperandContext Context);
-    void printConstantWithCast(Constant *CPV, unsigned Opcode);
-    bool printConstExprCast(ConstantExpr *CE);
-    void printConstantArray(ConstantArray *CPA, enum OperandContext Context);
-    void printConstantVector(ConstantVector *CV, enum OperandContext Context);
-    void printConstantDataSequential(ConstantDataSequential *CDS, enum OperandContext Context);
-    bool printConstantString(Constant *C, enum OperandContext Context);
-
-    bool isEmptyType(Type *Ty) const;
-    bool isAddressExposed(Value *V) const;
-    bool isInlinableInst(Instruction &I) const;
-    AllocaInst *isDirectAlloca(Value *V) const;
-    bool isInlineAsm(Instruction& I) const;
-
-    // Instruction visitation functions
-    friend class InstVisitor<CWriter>;
-
-    void visitReturnInst(ReturnInst &I);
-    void visitBranchInst(BranchInst &I);
-    void visitSwitchInst(SwitchInst &I);
-    void visitIndirectBrInst(IndirectBrInst &I);
-    void visitInvokeInst(InvokeInst &I) {
-      llvm_unreachable("Lowerinvoke pass didn't work!");
-    }
-    void visitResumeInst(ResumeInst &I) {
-      llvm_unreachable("DwarfEHPrepare pass didn't work!");
-    }
-    void visitUnreachableInst(UnreachableInst &I);
-
-    void visitPHINode(PHINode &I);
-    void visitBinaryOperator(BinaryOperator &I);
-    void visitICmpInst(ICmpInst &I);
-    void visitFCmpInst(FCmpInst &I);
-
-    void visitCastInst (CastInst &I);
-    void visitSelectInst(SelectInst &I);
-    void visitCallInst (CallInst &I);
-    void visitInlineAsm(CallInst &I);
-    bool visitBuiltinCall(CallInst &I, Intrinsic::ID ID);
-
-    void visitAllocaInst(AllocaInst &I);
-    void visitLoadInst  (LoadInst   &I);
-    void visitStoreInst (StoreInst  &I);
-    void visitGetElementPtrInst(GetElementPtrInst &I);
-    void visitVAArgInst (VAArgInst &I);
-
-    void visitInsertElementInst(InsertElementInst &I);
-    void visitExtractElementInst(ExtractElementInst &I);
-    void visitShuffleVectorInst(ShuffleVectorInst &SVI);
-
-    void visitInsertValueInst(InsertValueInst &I);
-    void visitExtractValueInst(ExtractValueInst &I);
-    void visitInstruction(Instruction &I) {
+    return printFunctionProto(
+        Out, F->getFunctionType(),
+        std::make_pair(F->getAttributes(), F->getCallingConv()),
+        GetValueName(F), NULL, isKernel);
+  }
+
+  raw_ostream &
+  printFunctionDeclaration(raw_ostream &Out, FunctionType *Ty,
+                           std::pair<AttributeList, CallingConv::ID> PAL =
+                               std::make_pair(AttributeList(), CallingConv::C));
+  raw_ostream &printStructDeclaration(raw_ostream &Out, StructType *Ty);
+  raw_ostream &printArrayDeclaration(raw_ostream &Out, ArrayType *Ty);
+  raw_ostream &printVectorDeclaration(raw_ostream &Out, VectorType *Ty);
+
+  raw_ostream &printTypeName(raw_ostream &Out, Type *Ty, bool isSigned = false,
+                             std::pair<AttributeList, CallingConv::ID> PAL =
+                                 std::make_pair(AttributeList(),
+                                                CallingConv::C));
+  raw_ostream &printTypeNameUnaligned(raw_ostream &Out, Type *Ty,
+                                      bool isSigned = false);
+  raw_ostream &printSimpleType(raw_ostream &Out, Type *Ty, bool isSigned);
+  raw_ostream &printTypeString(raw_ostream &Out, Type *Ty, bool isSigned);
+
+  std::string getStructName(StructType *ST);
+  std::string getFunctionName(FunctionType *FT,
+                              std::pair<AttributeList, CallingConv::ID> PAL =
+                                  std::make_pair(AttributeList(),
+                                                 CallingConv::C));
+  std::string getArrayName(ArrayType *AT);
+  std::string getVectorName(VectorType *VT, bool Aligned);
+
+  enum OperandContext {
+    ContextNormal,
+    ContextCasted,
+    // ContextCasted: the type-cast will be implicit,
+    // such as the RHS of a `var = RHS;` expression
+    // or inside a struct initializer expression
+    ContextStatic
+    // ContextStatic: the operand is being used as a static initializer
+    // (also implies ContextCasted)
+  };
+
+  void writeOperandDeref(Value *Operand);
+  void writeOperand(Value *Operand, enum OperandContext Context = ContextNormal,
+                    bool arrayAccess = false);
+  void writeInstComputationInline(Instruction &I);
+  void writeOperandInternal(Value *Operand,
+                            enum OperandContext Context = ContextNormal);
+  void writeOperandWithCast(Value *Operand, unsigned Opcode);
+  void opcodeNeedsCast(unsigned Opcode, bool &shouldCast, bool &castIsSigned);
+
+  void writeOperandWithCast(Value *Operand, ICmpInst &I);
+  bool writeInstructionCast(Instruction &I);
+  void writeMemoryAccess(Value *Operand, Type *OperandType, bool IsVolatile,
+                         unsigned Alignment);
+
+  std::string InterpretASMConstraint(InlineAsm::ConstraintInfo &c);
+
+  void lowerIntrinsics(Function &F);
+  /// Prints the definition of the intrinsic function F. Supports the
+  /// intrinsics which need to be explicitly defined in the CBackend.
+  void printIntrinsicDefinition(Function &F, raw_ostream &Out);
+  void printIntrinsicDefinition(FunctionType *funT, unsigned Opcode,
+                                std::string OpName, raw_ostream &Out);
+
+  void printModuleTypes(raw_ostream &Out);
+  void printContainedTypes(raw_ostream &Out, Type *Ty, std::set<Type *> &);
+
+  void printFloatingPointConstants(Function &F);
+  void printFloatingPointConstants(const Constant *C);
+
+  void printFunction(Function &);
+  void printBasicBlock(BasicBlock *BB);
+  void printLoop(Loop *L);
+
+  void printCast(unsigned opcode, Type *SrcTy, Type *DstTy);
+  void printConstant(Constant *CPV, enum OperandContext Context);
+  void printConstantWithCast(Constant *CPV, unsigned Opcode);
+  bool printConstExprCast(ConstantExpr *CE);
+  void printConstantArray(ConstantArray *CPA, enum OperandContext Context);
+  void printConstantVector(ConstantVector *CV, enum OperandContext Context);
+  void printConstantDataSequential(ConstantDataSequential *CDS,
+                                   enum OperandContext Context);
+  bool printConstantString(Constant *C, enum OperandContext Context);
+
+  bool isEmptyType(Type *Ty) const;
+  bool isAddressExposed(Value *V) const;
+  bool isInlinableInst(Instruction &I) const;
+  AllocaInst *isDirectAlloca(Value *V) const;
+  bool isInlineAsm(Instruction &I) const;
+
+  // Instruction visitation functions
+  friend class InstVisitor<CWriter>;
+
+  void visitReturnInst(ReturnInst &I);
+  void visitBranchInst(BranchInst &I);
+  void visitSwitchInst(SwitchInst &I);
+  void visitIndirectBrInst(IndirectBrInst &I);
+  void visitInvokeInst(InvokeInst &I) { // never emitted; reaching here aborts
+    llvm_unreachable("Lowerinvoke pass didn't work!");
+  }
+  void visitResumeInst(ResumeInst &I) { // never emitted; reaching here aborts
+    llvm_unreachable("DwarfEHPrepare pass didn't work!");
+  }
+  void visitUnreachableInst(UnreachableInst &I);
+
+  void visitPHINode(PHINode &I);
+  void visitBinaryOperator(BinaryOperator &I);
+  void visitICmpInst(ICmpInst &I);
+  void visitFCmpInst(FCmpInst &I);
+
+  void visitCastInst(CastInst &I);
+  void visitSelectInst(SelectInst &I);
+  void visitCallInst(CallInst &I);
+  void visitInlineAsm(CallInst &I);
+  bool visitBuiltinCall(CallInst &I, Intrinsic::ID ID);
+
+  void visitAllocaInst(AllocaInst &I);
+  void visitLoadInst(LoadInst &I);
+  void visitStoreInst(StoreInst &I);
+  void visitGetElementPtrInst(GetElementPtrInst &I);
+  void visitVAArgInst(VAArgInst &I);
+
+  void visitInsertElementInst(InsertElementInst &I);
+  void visitExtractElementInst(ExtractElementInst &I);
+  void visitShuffleVectorInst(ShuffleVectorInst &SVI);
+
+  void visitInsertValueInst(InsertValueInst &I);
+  void visitExtractValueInst(ExtractValueInst &I);
+  void visitInstruction(Instruction &I) {
 #ifndef NDEBUG
-      errs() << "C Writer does not know about " << I;
+    errs() << "C Writer does not know about " << I;
 #endif
-      llvm_unreachable(0);
-    }
-
-    void outputLValue(Instruction *I) {
-      Out << "  " << GetValueName(I) << " = ";
-    }
-
-    bool extractIndVarChain(Instruction *Inst, std::stack<Instruction*> *IndVarChain, Instruction *Branch, unsigned indent);
-    
-    bool traverseUseDefChain(Instruction*I, PHINode*PI);
-    bool isGotoCodeNecessary(BasicBlock *From, BasicBlock *To);
-    void printPHICopiesForSuccessor(BasicBlock *CurBlock,
-        BasicBlock *Successor, unsigned Indent);
-    void printBranchToBlock(BasicBlock *CurBlock, BasicBlock *SuccBlock,
-        unsigned Indent);
-    void printGEPExpression(Value *Ptr, gep_type_iterator I, gep_type_iterator E, bool isArrayType, GetElementPtrInst*);
-
-
-    bool findLoopBranch(BranchInst **LBranch, BasicBlock* CurBlock, BasicBlock* LHeader, std::set<BasicBlock*>*visitSet);
-    std::string GetValueName(Value *Operand);
-    void printBBorLoop(BasicBlock *BB);
-
-    bool compareBlocks(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock *ImmPostDomm);
-    bool findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock, BasicBlock *ImmPostDomm);
-  };
-}
+    llvm_unreachable(0);
+  }
+
+  void outputLValue(Instruction *I) { Out << "  " << GetValueName(I) << " = "; } // writes `  <name> = ` to Out
+
+  bool extractIndVarChain(Instruction *Inst,
+                          std::stack<Instruction *> *IndVarChain,
+                          Instruction *Branch, unsigned indent);
+
+  bool traverseUseDefChain(Instruction *I, PHINode *PI);
+  bool isGotoCodeNecessary(BasicBlock *From, BasicBlock *To);
+  void printPHICopiesForSuccessor(BasicBlock *CurBlock, BasicBlock *Successor,
+                                  unsigned Indent);
+  void printBranchToBlock(BasicBlock *CurBlock, BasicBlock *SuccBlock,
+                          unsigned Indent);
+  void printGEPExpression(Value *Ptr, gep_type_iterator I, gep_type_iterator E,
+                          bool isArrayType, GetElementPtrInst *);
+
+  bool findLoopBranch(BranchInst **LBranch, BasicBlock *CurBlock,
+                      BasicBlock *LHeader, std::set<BasicBlock *> *visitSet);
+  std::string GetValueName(Value *Operand);
+  void printBBorLoop(BasicBlock *BB);
+
+  bool compareBlocks(BasicBlock *CurrBlock, BasicBlock *CompBlock,
+                     BasicBlock *ImmPostDomm);
+  bool findMatch(BasicBlock *CurrBlock, BasicBlock *CompBlock,
+                 BasicBlock *ImmPostDomm);
+};
+} // namespace
diff --git a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CTargetMachine.h b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CTargetMachine.h
index b6c02fc46186f78ee085ff0d7fb050ad2002743f..1c61289817d148365742f89f3f3999500283bd8a 100644
--- a/hpvm/projects/llvm-cbe/lib/Target/CBackend/CTargetMachine.h
+++ b/hpvm/projects/llvm-cbe/lib/Target/CBackend/CTargetMachine.h
@@ -14,23 +14,21 @@
 #ifndef CTARGETMACHINE_H
 #define CTARGETMACHINE_H
 
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/IR/DataLayout.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
 
 struct CTargetMachine : public TargetMachine {
-  
-  // NOTE: Interface change 
-  CTargetMachine(const Target &T, const Triple &TargetTriple,
-		 StringRef CPU, StringRef FS,
-		 const TargetOptions &Options,
-		 Optional<Reloc::Model> RM,
-		 Optional<CodeModel::Model> CM,
-		 CodeGenOpt::Level OL, bool JIT)
-    
-    : TargetMachine(T, "", TargetTriple, CPU, FS, Options) { }
+
+  // NOTE: Interface change
+  CTargetMachine(const Target &T, const Triple &TargetTriple, StringRef CPU,
+                 StringRef FS, const TargetOptions &Options,
+                 Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                 CodeGenOpt::Level OL, bool JIT)
+
+      : TargetMachine(T, "", TargetTriple, CPU, FS, Options) {}
 
   /// Add passes to the specified pass manager to get the specified file
   /// emitted.  Typically this will involve several steps of code generation.
@@ -38,21 +36,20 @@ struct CTargetMachine : public TargetMachine {
   /*bool addPassesToEmitFile(
     PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType,
     bool DisableVerify = true, AnalysisID StartBefore = nullptr,
-    AnalysisID StartAfter = nullptr, AnalysisID StopBefore = nullptr, 
+    AnalysisID StartAfter = nullptr, AnalysisID StopBefore = nullptr,
     AnalysisID StopAfter = nullptr) override;
     //MachineFunctionInitializer *MFInitializer = nullptr) override;
   */
 
   virtual bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &Out,
-                                   raw_pwrite_stream *Out2, CodeGenFileType FileType,
+                                   raw_pwrite_stream *Out2,
+                                   CodeGenFileType FileType,
                                    bool DisableVerify = true,
-				   MachineModuleInfo *MMI = nullptr) override;
-
+                                   MachineModuleInfo *MMI = nullptr) override;
 };
 
 extern Target TheCBackendTarget;
 
-} // End llvm namespace
-
+} // namespace llvm
 
 #endif
diff --git a/hpvm/projects/llvm-cbe/test/APInt-C.cpp b/hpvm/projects/llvm-cbe/test/APInt-C.cpp
index 0dec791c141d94939bf0762120be55ce184dcfb6..c44440985a0b50a57bd25e1995d39cd904ec32c5 100644
--- a/hpvm/projects/llvm-cbe/test/APInt-C.cpp
+++ b/hpvm/projects/llvm-cbe/test/APInt-C.cpp
@@ -1,9 +1,9 @@
 // This file is a part of Julia. License is MIT: http://julialang.org/license
 
 #include "llvm-version.h"
-#include <llvm/ADT/ArrayRef.h>
-#include <llvm/ADT/APInt.h>
 #include <llvm/ADT/APFloat.h>
+#include <llvm/ADT/APInt.h>
+#include <llvm/ADT/ArrayRef.h>
 #include <llvm/Support/MathExtras.h>
 
 #include "APInt-C.h"
@@ -12,524 +12,539 @@
 using namespace llvm;
 
 #if JL_LLVM_VERSION >= 30900
-inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
-    return alignTo(Value, Align, Skew);
+inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align,
+                                   uint64_t Skew = 0) {
+  return alignTo(Value, Align, Skew); // forwards to alignTo; only compiled when JL_LLVM_VERSION >= 30900
 }
 #endif
 
 /* create "APInt s" from "integerPart *ps" */
-#define CREATE(s) \
-    APInt s; \
-    if ((numbits % integerPartWidth) != 0) { \
-        /* use LLT_ALIGN to round the memory area up to the nearest integerPart-sized chunk */ \
-        unsigned nbytes = RoundUpToAlignment(numbits, integerPartWidth) / host_char_bit; \
-        integerPart *data_a64 = (integerPart*)alloca(nbytes); \
-        /* TODO: this memcpy assumes little-endian,
-         * for big-endian, need to align the copy to the other end */ \
-        memcpy(data_a64, p##s, RoundUpToAlignment(numbits, host_char_bit) / host_char_bit); \
-        s = APInt(numbits, makeArrayRef(data_a64, nbytes / sizeof(integerPart))); \
-    } \
-    else { \
-        s = APInt(numbits, makeArrayRef(p##s, numbits / integerPartWidth)); \
-    }
+#define CREATE(s) /* reads numbits and p##s from the enclosing scope */        \
+  APInt s;                                                                     \
+  if ((numbits % integerPartWidth) != 0) {                                     \
+    /* use RoundUpToAlignment to round the memory area up to the nearest       \
+     * integerPart-sized chunk */                                              \
+    unsigned nbytes =                                                          \
+        RoundUpToAlignment(numbits, integerPartWidth) / host_char_bit;         \
+    integerPart *data_a64 = (integerPart *)alloca(nbytes);                     \
+    /* TODO: this memcpy assumes little-endian; for big-endian, need to        \
+     * align the copy to the other end */                                      \
+    memcpy(data_a64, p##s,                                                     \
+           RoundUpToAlignment(numbits, host_char_bit) / host_char_bit);        \
+    s = APInt(numbits, makeArrayRef(data_a64, nbytes / sizeof(integerPart)));  \
+  } else {                                                                     \
+    s = APInt(numbits, makeArrayRef(p##s, numbits / integerPartWidth));        \
+  }
 
 /* assign to "integerPart *pr" from "APInt a" */
-#define ASSIGN(r, a) \
-    if (numbits <= 8) \
-        *(uint8_t*)p##r = a.getZExtValue(); \
-    else if (numbits <= 16) \
-        *(uint16_t*)p##r = a.getZExtValue(); \
-    else if (numbits <= 32) \
-        *(uint32_t*)p##r = a.getZExtValue(); \
-    else if (numbits <= 64) \
-        *(uint64_t*)p##r = a.getZExtValue(); \
-    else \
-        memcpy(p##r, a.getRawData(), RoundUpToAlignment(numbits, host_char_bit) / host_char_bit); \
-
-extern "C" JL_DLLEXPORT
-void LLVMNeg(unsigned numbits, integerPart *pa, integerPart *pr) {
-    APInt z(numbits, 0);
-    CREATE(a)
-    z -= a;
-    ASSIGN(r, z)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMAdd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a += b;
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMSub(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a -= b;
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMMul(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a *= b;
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMSDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a = a.sdiv(b);
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMUDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a = a.udiv(b);
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMSRem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a = a.srem(b);
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMURem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a = a.urem(b);
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMICmpEQ(unsigned numbits, integerPart *pa, integerPart *pb) {
-    CREATE(a)
-    CREATE(b)
-    return a.eq(b);
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMICmpNE(unsigned numbits, integerPart *pa, integerPart *pb) {
-    CREATE(a)
-    CREATE(b)
-    return a.ne(b);
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMICmpSLT(unsigned numbits, integerPart *pa, integerPart *pb) {
-    CREATE(a)
-    CREATE(b)
-    return a.slt(b);
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMICmpULT(unsigned numbits, integerPart *pa, integerPart *pb) {
-    CREATE(a)
-    CREATE(b)
-    return a.ult(b);
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMICmpSLE(unsigned numbits, integerPart *pa, integerPart *pb) {
-    CREATE(a)
-    CREATE(b)
-    return a.sle(b);
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMICmpULE(unsigned numbits, integerPart *pa, integerPart *pb) {
-    CREATE(a)
-    CREATE(b)
-    return a.ule(b);
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMAnd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a &= b;
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMOr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a |= b;
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMXor(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a ^= b;
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMShl(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a = a.shl(b);
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMLShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a = a.lshr(b);
-    ASSIGN(r, a)
-}
-extern "C" JL_DLLEXPORT
-void LLVMAShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a = a.ashr(b);
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMFlipAllBits(unsigned numbits, integerPart *pa, integerPart *pr) {
-    CREATE(a)
-    a.flipAllBits();
-    ASSIGN(r, a)
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMAdd_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    bool Overflow;
-    a = a.uadd_ov(b, Overflow);
-    ASSIGN(r, a)
-    return Overflow;
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMAdd_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    bool Overflow;
-    a = a.sadd_ov(b, Overflow);
-    ASSIGN(r, a)
-    return Overflow;
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMSub_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    bool Overflow;
-    a = a.usub_ov(b, Overflow);
-    ASSIGN(r, a)
-    return Overflow;
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMSub_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    bool Overflow;
-    a = a.ssub_ov(b, Overflow);
-    ASSIGN(r, a)
-    return Overflow;
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMMul_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    bool Overflow;
-    a = a.smul_ov(b, Overflow);
-    ASSIGN(r, a)
-    return Overflow;
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMMul_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    bool Overflow;
-    a = a.umul_ov(b, Overflow);
-    ASSIGN(r, a)
-    return Overflow;
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMDiv_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    bool Overflow;
-    a = a.sdiv_ov(b, Overflow);
-    ASSIGN(r, a)
-    return Overflow;
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMDiv_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a = a.udiv(b);
-    ASSIGN(r, a)
-    // unsigned division cannot overflow
-    return false;
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMRem_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a = a.srem(b);
-    ASSIGN(r, a)
-    // signed remainder cannot overflow
-    return false;
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMRem_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    a = a.urem(b);
-    ASSIGN(r, a)
-    // unsigned remainder cannot overflow
-    return false;
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMByteSwap(unsigned numbits, integerPart *pa, integerPart *pr) {
-    CREATE(a)
-    a = a.byteSwap();
-    ASSIGN(r, a)
-}
-
-void LLVMFPtoInt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr, bool isSigned, bool *isExact) {
-    double Val;
-    if (numbits == 32)
-        Val = *(float*)pa;
-    else if (numbits == 64)
-        Val = *(double*)pa;
-    else
-        jl_error("FPtoSI: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
-    unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
-    if (onumbits <= 64) { // fast-path, if possible
-        if (isSigned) {
-            int64_t ia = Val;
-            memcpy(pr, &ia, onumbytes); // TODO: assumes little-endian
-            if (isExact) {
-                // check whether the conversion was lossless
-                int64_t ia2 = ia < 0 ? -1 : 0;
-                memcpy(&ia2, pr, onumbytes);
-                *isExact = (Val == (double)ia2 && ia == ia2);
-            }
-        }
-        else {
-            uint64_t ia = Val;
-            memcpy(pr, &ia, onumbytes); // TODO: assumes little-endian
-            if (isExact) {
-                // check whether the conversion was lossless
-                uint64_t ia2 = 0;
-                memcpy(&ia2, pr, onumbytes);
-                *isExact = (Val == (double)ia2 && ia == ia2);
-            }
-        }
+#define ASSIGN(r, a) /* expands to an unbraced if/else chain ending in ';' */  \
+  if (numbits <= 8) /* <= 64 bits: store via the narrowest fitting uintN_t */  \
+    *(uint8_t *)p##r = a.getZExtValue();                                       \
+  else if (numbits <= 16)                                                      \
+    *(uint16_t *)p##r = a.getZExtValue();                                      \
+  else if (numbits <= 32)                                                      \
+    *(uint32_t *)p##r = a.getZExtValue();                                      \
+  else if (numbits <= 64)                                                      \
+    *(uint64_t *)p##r = a.getZExtValue();                                      \
+  else /* numbits > 64: memcpy ceil(numbits / host_char_bit) bytes */          \
+    memcpy(p##r, a.getRawData(),                                               \
+           RoundUpToAlignment(numbits, host_char_bit) / host_char_bit);
+
+extern "C" JL_DLLEXPORT void LLVMNeg(unsigned numbits, integerPart *pa,
+                                     integerPart *pr) {
+  APInt z(numbits, 0); // zero with the same bit width as the operand
+  CREATE(a)            // APInt a built from the numbits bits at pa
+  z -= a;              // z = 0 - a
+  ASSIGN(r, z)         // write z out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMAdd(unsigned numbits, integerPart *pa,
+                                     integerPart *pb, integerPart *pr) {
+  CREATE(a)    // APInt a built from the numbits bits at pa
+  CREATE(b)    // APInt b built from the numbits bits at pb
+  a += b;      // sum accumulated in a
+  ASSIGN(r, a) // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMSub(unsigned numbits, integerPart *pa,
+                                     integerPart *pb, integerPart *pr) {
+  CREATE(a)    // APInt a built from the numbits bits at pa
+  CREATE(b)    // APInt b built from the numbits bits at pb
+  a -= b;      // difference accumulated in a
+  ASSIGN(r, a) // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMMul(unsigned numbits, integerPart *pa,
+                                     integerPart *pb, integerPart *pr) {
+  CREATE(a)    // APInt a built from the numbits bits at pa
+  CREATE(b)    // APInt b built from the numbits bits at pb
+  a *= b;      // product accumulated in a
+  ASSIGN(r, a) // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMSDiv(unsigned numbits, integerPart *pa,
+                                      integerPart *pb, integerPart *pr) {
+  CREATE(a)      // APInt a built from the numbits bits at pa
+  CREATE(b)      // APInt b built from the numbits bits at pb
+  a = a.sdiv(b); // signed quotient (APInt::sdiv); b is not checked for zero here
+  ASSIGN(r, a)   // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMUDiv(unsigned numbits, integerPart *pa,
+                                      integerPart *pb, integerPart *pr) {
+  CREATE(a)      // APInt a built from the numbits bits at pa
+  CREATE(b)      // APInt b built from the numbits bits at pb
+  a = a.udiv(b); // unsigned quotient (APInt::udiv); b is not checked for zero here
+  ASSIGN(r, a)   // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMSRem(unsigned numbits, integerPart *pa,
+                                      integerPart *pb, integerPart *pr) {
+  CREATE(a)      // APInt a built from the numbits bits at pa
+  CREATE(b)      // APInt b built from the numbits bits at pb
+  a = a.srem(b); // signed remainder (APInt::srem); b is not checked for zero here
+  ASSIGN(r, a)   // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMURem(unsigned numbits, integerPart *pa,
+                                      integerPart *pb, integerPart *pr) {
+  CREATE(a)      // APInt a built from the numbits bits at pa
+  CREATE(b)      // APInt b built from the numbits bits at pb
+  a = a.urem(b); // unsigned remainder (APInt::urem); b is not checked for zero here
+  ASSIGN(r, a)   // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT int LLVMICmpEQ(unsigned numbits, integerPart *pa,
+                                       integerPart *pb) {
+  CREATE(a)       // APInt a built from the numbits bits at pa
+  CREATE(b)       // APInt b built from the numbits bits at pb
+  return a.eq(b); // equality test; the bool converts to the int return type
+}
+
+extern "C" JL_DLLEXPORT int LLVMICmpNE(unsigned numbits, integerPart *pa,
+                                       integerPart *pb) {
+  CREATE(a)       // APInt a built from the numbits bits at pa
+  CREATE(b)       // APInt b built from the numbits bits at pb
+  return a.ne(b); // inequality test; the bool converts to the int return type
+}
+
+extern "C" JL_DLLEXPORT int LLVMICmpSLT(unsigned numbits, integerPart *pa,
+                                        integerPart *pb) {
+  CREATE(a)        // APInt a built from the numbits bits at pa
+  CREATE(b)        // APInt b built from the numbits bits at pb
+  return a.slt(b); // signed a < b; the bool converts to the int return type
+}
+
+extern "C" JL_DLLEXPORT int LLVMICmpULT(unsigned numbits, integerPart *pa,
+                                        integerPart *pb) {
+  CREATE(a)        // APInt a built from the numbits bits at pa
+  CREATE(b)        // APInt b built from the numbits bits at pb
+  return a.ult(b); // unsigned a < b; the bool converts to the int return type
+}
+
+extern "C" JL_DLLEXPORT int LLVMICmpSLE(unsigned numbits, integerPart *pa,
+                                        integerPart *pb) {
+  CREATE(a)        // APInt a built from the numbits bits at pa
+  CREATE(b)        // APInt b built from the numbits bits at pb
+  return a.sle(b); // signed a <= b; the bool converts to the int return type
+}
+
+extern "C" JL_DLLEXPORT int LLVMICmpULE(unsigned numbits, integerPart *pa,
+                                        integerPart *pb) {
+  CREATE(a)        // APInt a built from the numbits bits at pa
+  CREATE(b)        // APInt b built from the numbits bits at pb
+  return a.ule(b); // unsigned a <= b; the bool converts to the int return type
+}
+
+extern "C" JL_DLLEXPORT void LLVMAnd(unsigned numbits, integerPart *pa,
+                                     integerPart *pb, integerPart *pr) {
+  CREATE(a)    // APInt a built from the numbits bits at pa
+  CREATE(b)    // APInt b built from the numbits bits at pb
+  a &= b;      // bitwise AND accumulated in a
+  ASSIGN(r, a) // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMOr(unsigned numbits, integerPart *pa,
+                                    integerPart *pb, integerPart *pr) {
+  CREATE(a)    // APInt a built from the numbits bits at pa
+  CREATE(b)    // APInt b built from the numbits bits at pb
+  a |= b;      // bitwise OR accumulated in a
+  ASSIGN(r, a) // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMXor(unsigned numbits, integerPart *pa,
+                                     integerPart *pb, integerPart *pr) {
+  CREATE(a)    // APInt a built from the numbits bits at pa
+  CREATE(b)    // APInt b built from the numbits bits at pb
+  a ^= b;      // bitwise XOR accumulated in a
+  ASSIGN(r, a) // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMShl(unsigned numbits, integerPart *pa,
+                                     integerPart *pb, integerPart *pr) {
+  CREATE(a)     // APInt a built from the numbits bits at pa
+  CREATE(b)     // APInt b (the shift amount) built from the bits at pb
+  a = a.shl(b); // left shift of a by b
+  ASSIGN(r, a)  // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMLShr(unsigned numbits, integerPart *pa,
+                                      integerPart *pb, integerPart *pr) {
+  CREATE(a)      // APInt a built from the numbits bits at pa
+  CREATE(b)      // APInt b (the shift amount) built from the bits at pb
+  a = a.lshr(b); // logical right shift of a by b (APInt::lshr)
+  ASSIGN(r, a)   // write a out through pr
+}
+extern "C" JL_DLLEXPORT void LLVMAShr(unsigned numbits, integerPart *pa,
+                                      integerPart *pb, integerPart *pr) {
+  CREATE(a)      // APInt a built from the numbits bits at pa
+  CREATE(b)      // APInt b (the shift amount) built from the bits at pb
+  a = a.ashr(b); // arithmetic right shift of a by b (APInt::ashr)
+  ASSIGN(r, a)   // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT void LLVMFlipAllBits(unsigned numbits, integerPart *pa,
+                                             integerPart *pr) {
+  CREATE(a)        // APInt a built from the numbits bits at pa
+  a.flipAllBits(); // invert every bit of a in place
+  ASSIGN(r, a)     // write a out through pr
+}
+
+extern "C" JL_DLLEXPORT int LLVMAdd_uov(unsigned numbits, integerPart *pa,
+                                        integerPart *pb, integerPart *pr) {
+  CREATE(a)                   // APInt a built from the numbits bits at pa
+  CREATE(b)                   // APInt b built from the numbits bits at pb
+  bool Overflow;              // filled in by uadd_ov below
+  a = a.uadd_ov(b, Overflow); // unsigned add with overflow flag
+  ASSIGN(r, a)                // result is written through pr even on overflow
+  return Overflow;            // bool converts to the int return type
+}
+
+extern "C" JL_DLLEXPORT int LLVMAdd_sov(unsigned numbits, integerPart *pa,
+                                        integerPart *pb, integerPart *pr) {
+  CREATE(a)                   // APInt a built from the numbits bits at pa
+  CREATE(b)                   // APInt b built from the numbits bits at pb
+  bool Overflow;              // filled in by sadd_ov below
+  a = a.sadd_ov(b, Overflow); // signed add with overflow flag
+  ASSIGN(r, a)                // result is written through pr even on overflow
+  return Overflow;            // bool converts to the int return type
+}
+
+extern "C" JL_DLLEXPORT int LLVMSub_uov(unsigned numbits, integerPart *pa,
+                                        integerPart *pb, integerPart *pr) {
+  CREATE(a)
+  CREATE(b)
+  bool Overflow;
+  a = a.usub_ov(b, Overflow);
+  ASSIGN(r, a)
+  return Overflow;
+}
+
+extern "C" JL_DLLEXPORT int LLVMSub_sov(unsigned numbits, integerPart *pa,
+                                        integerPart *pb, integerPart *pr) {
+  CREATE(a)
+  CREATE(b)
+  bool Overflow;
+  a = a.ssub_ov(b, Overflow);
+  ASSIGN(r, a)
+  return Overflow;
+}
+
+extern "C" JL_DLLEXPORT int LLVMMul_sov(unsigned numbits, integerPart *pa,
+                                        integerPart *pb, integerPart *pr) {
+  CREATE(a)
+  CREATE(b)
+  bool Overflow;
+  a = a.smul_ov(b, Overflow);
+  ASSIGN(r, a)
+  return Overflow;
+}
+
+extern "C" JL_DLLEXPORT int LLVMMul_uov(unsigned numbits, integerPart *pa,
+                                        integerPart *pb, integerPart *pr) {
+  CREATE(a)
+  CREATE(b)
+  bool Overflow;
+  a = a.umul_ov(b, Overflow);
+  ASSIGN(r, a)
+  return Overflow;
+}
+
+extern "C" JL_DLLEXPORT int LLVMDiv_sov(unsigned numbits, integerPart *pa,
+                                        integerPart *pb, integerPart *pr) {
+  CREATE(a)
+  CREATE(b)
+  bool Overflow;
+  a = a.sdiv_ov(b, Overflow);
+  ASSIGN(r, a)
+  return Overflow;
+}
+
+extern "C" JL_DLLEXPORT int LLVMDiv_uov(unsigned numbits, integerPart *pa,
+                                        integerPart *pb, integerPart *pr) {
+  CREATE(a)
+  CREATE(b)
+  a = a.udiv(b);
+  ASSIGN(r, a)
+  // unsigned division cannot overflow
+  return false;
+}
+
+extern "C" JL_DLLEXPORT int LLVMRem_sov(unsigned numbits, integerPart *pa,
+                                        integerPart *pb, integerPart *pr) {
+  CREATE(a)
+  CREATE(b)
+  a = a.srem(b);
+  ASSIGN(r, a)
+  // signed remainder cannot overflow
+  return false;
+}
+
+extern "C" JL_DLLEXPORT int LLVMRem_uov(unsigned numbits, integerPart *pa,
+                                        integerPart *pb, integerPart *pr) {
+  CREATE(a)
+  CREATE(b)
+  a = a.urem(b);
+  ASSIGN(r, a)
+  // unsigned remainder cannot overflow
+  return false;
+}
+
+extern "C" JL_DLLEXPORT void LLVMByteSwap(unsigned numbits, integerPart *pa,
+                                          integerPart *pr) {
+  CREATE(a)
+  a = a.byteSwap();
+  ASSIGN(r, a)
+}
+
+void LLVMFPtoInt(unsigned numbits, integerPart *pa, unsigned onumbits,
+                 integerPart *pr, bool isSigned, bool *isExact) {
+  double Val;
+  if (numbits == 32)
+    Val = *(float *)pa;
+  else if (numbits == 64)
+    Val = *(double *)pa;
+  else
+    jl_error("FPtoSI: runtime floating point intrinsics are not implemented "
+             "for bit sizes other than 32 and 64");
+  unsigned onumbytes =
+      RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
+  if (onumbits <= 64) { // fast-path, if possible
+    if (isSigned) {
+      int64_t ia = Val;
+      memcpy(pr, &ia, onumbytes); // TODO: assumes little-endian
+      if (isExact) {
+        // check whether the conversion was lossless
+        int64_t ia2 = ia < 0 ? -1 : 0;
+        memcpy(&ia2, pr, onumbytes);
+        *isExact = (Val == (double)ia2 && ia == ia2);
+      }
+    } else {
+      uint64_t ia = Val;
+      memcpy(pr, &ia, onumbytes); // TODO: assumes little-endian
+      if (isExact) {
+        // check whether the conversion was lossless
+        uint64_t ia2 = 0;
+        memcpy(&ia2, pr, onumbytes);
+        *isExact = (Val == (double)ia2 && ia == ia2);
+      }
     }
-    else {
-        APFloat a(Val);
-        bool isVeryExact;
-        APFloat::roundingMode rounding_mode = APFloat::rmNearestTiesToEven;
-        unsigned nbytes = RoundUpToAlignment(onumbits, integerPartWidth) / host_char_bit;
-        integerPart *parts = (integerPart*)alloca(nbytes);
-        APFloat::opStatus status = a.convertToInteger(parts, onumbits, isSigned, rounding_mode, &isVeryExact);
-        memcpy(pr, parts, onumbytes);
-        if (isExact)
-            *isExact = (status == APFloat::opOK);
-    }
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMFPtoSI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) {
-    LLVMFPtoInt(numbits, pa, onumbits, pr, true, NULL);
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMFPtoUI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) {
-    LLVMFPtoInt(numbits, pa, onumbits, pr, false, NULL);
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) {
-    bool isExact;
-    LLVMFPtoInt(numbits, pa, onumbits, pr, true, &isExact);
-    return isExact;
-}
-
-extern "C" JL_DLLEXPORT
-int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) {
-    bool isExact;
-    LLVMFPtoInt(numbits, pa, onumbits, pr, false, &isExact);
-    return isExact;
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMSItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) {
-    CREATE(a)
-    double val = a.roundToDouble(true);
-    if (onumbits == 32)
-        *(float*)pr = val;
-    else if (onumbits == 64)
-        *(double*)pr = val;
-    else
-        jl_error("SItoFP: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMUItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) {
-    CREATE(a)
-    double val = a.roundToDouble(false);
-    if (onumbits == 32)
-        *(float*)pr = val;
-    else if (onumbits == 64)
-        *(double*)pr = val;
-    else
-        jl_error("UItoFP: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMSExt(unsigned inumbits, integerPart *pa, unsigned onumbits, integerPart *pr) {
-    assert(inumbits < onumbits);
-    unsigned inumbytes = RoundUpToAlignment(inumbits, host_char_bit) / host_char_bit;
-    unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
-    int bits = (0 - inumbits) % host_char_bit;
-    int signbit = (inumbits - 1) % host_char_bit;
-    int sign = ((unsigned char*)pa)[inumbytes - 1] & (1 << signbit) ? -1 : 0;
-    // copy over the input bytes
-    memcpy(pr, pa, inumbytes);
-    if (bits) {
-        // sign-extend the partial byte
-        ((signed char*)pr)[inumbytes - 1] = ((signed char*)pa)[inumbytes - 1] << bits >> bits;
-    }
-    // sign-extend the rest of the bytes
-    memset((char*)pr + inumbytes, sign, onumbytes - inumbytes);
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMZExt(unsigned inumbits, integerPart *pa, unsigned onumbits, integerPart *pr) {
-    assert(inumbits < onumbits);
-    unsigned inumbytes = RoundUpToAlignment(inumbits, host_char_bit) / host_char_bit;
-    unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
-    int bits = (0 - inumbits) % host_char_bit;
-    // copy over the input bytes
-    memcpy(pr, pa, inumbytes);
-    if (bits) {
-        // zero the remaining bits of the partial byte
-        ((unsigned char*)pr)[inumbytes - 1] = ((unsigned char*)pa)[inumbytes - 1] << bits >> bits;
-    }
-    // zero-extend the rest of the bytes
-    memset((char*)pr + inumbytes, 0, onumbytes - inumbytes);
-}
-
-extern "C" JL_DLLEXPORT
-void LLVMTrunc(unsigned inumbits, integerPart *pa, unsigned onumbits, integerPart *pr) {
-    assert(inumbits > onumbits);
-    unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
-    memcpy(pr, pa, onumbytes);
-}
-
-extern "C" JL_DLLEXPORT
-unsigned countTrailingZeros_8(uint8_t Val) {
+  } else {
+    APFloat a(Val);
+    bool isVeryExact;
+    APFloat::roundingMode rounding_mode = APFloat::rmNearestTiesToEven;
+    unsigned nbytes =
+        RoundUpToAlignment(onumbits, integerPartWidth) / host_char_bit;
+    integerPart *parts = (integerPart *)alloca(nbytes);
+    APFloat::opStatus status = a.convertToInteger(parts, onumbits, isSigned,
+                                                  rounding_mode, &isVeryExact);
+    memcpy(pr, parts, onumbytes);
+    if (isExact)
+      *isExact = (status == APFloat::opOK);
+  }
+}
+
+extern "C" JL_DLLEXPORT void LLVMFPtoSI(unsigned numbits, integerPart *pa,
+                                        unsigned onumbits, integerPart *pr) {
+  LLVMFPtoInt(numbits, pa, onumbits, pr, true, NULL);
+}
+
+extern "C" JL_DLLEXPORT void LLVMFPtoUI(unsigned numbits, integerPart *pa,
+                                        unsigned onumbits, integerPart *pr) {
+  LLVMFPtoInt(numbits, pa, onumbits, pr, false, NULL);
+}
+
+extern "C" JL_DLLEXPORT int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa,
+                                             unsigned onumbits,
+                                             integerPart *pr) {
+  bool isExact;
+  LLVMFPtoInt(numbits, pa, onumbits, pr, true, &isExact);
+  return isExact;
+}
+
+extern "C" JL_DLLEXPORT int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa,
+                                             unsigned onumbits,
+                                             integerPart *pr) {
+  bool isExact;
+  LLVMFPtoInt(numbits, pa, onumbits, pr, false, &isExact);
+  return isExact;
+}
+
+extern "C" JL_DLLEXPORT void LLVMSItoFP(unsigned numbits, integerPart *pa,
+                                        unsigned onumbits, integerPart *pr) {
+  CREATE(a)
+  double val = a.roundToDouble(true);
+  if (onumbits == 32)
+    *(float *)pr = val;
+  else if (onumbits == 64)
+    *(double *)pr = val;
+  else
+    jl_error("SItoFP: runtime floating point intrinsics are not implemented "
+             "for bit sizes other than 32 and 64");
+}
+
+extern "C" JL_DLLEXPORT void LLVMUItoFP(unsigned numbits, integerPart *pa,
+                                        unsigned onumbits, integerPart *pr) {
+  CREATE(a)
+  double val = a.roundToDouble(false);
+  if (onumbits == 32)
+    *(float *)pr = val;
+  else if (onumbits == 64)
+    *(double *)pr = val;
+  else
+    jl_error("UItoFP: runtime floating point intrinsics are not implemented "
+             "for bit sizes other than 32 and 64");
+}
+
+extern "C" JL_DLLEXPORT void LLVMSExt(unsigned inumbits, integerPart *pa,
+                                      unsigned onumbits, integerPart *pr) {
+  assert(inumbits < onumbits);
+  unsigned inumbytes =
+      RoundUpToAlignment(inumbits, host_char_bit) / host_char_bit;
+  unsigned onumbytes =
+      RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
+  int bits = (0 - inumbits) % host_char_bit;
+  int signbit = (inumbits - 1) % host_char_bit;
+  int sign = ((unsigned char *)pa)[inumbytes - 1] & (1 << signbit) ? -1 : 0;
+  // copy over the input bytes
+  memcpy(pr, pa, inumbytes);
+  if (bits) {
+    // sign-extend the partial byte
+    ((signed char *)pr)[inumbytes - 1] =
+        ((signed char *)pa)[inumbytes - 1] << bits >> bits;
+  }
+  // sign-extend the rest of the bytes
+  memset((char *)pr + inumbytes, sign, onumbytes - inumbytes);
+}
+
+extern "C" JL_DLLEXPORT void LLVMZExt(unsigned inumbits, integerPart *pa,
+                                      unsigned onumbits, integerPart *pr) {
+  assert(inumbits < onumbits);
+  unsigned inumbytes =
+      RoundUpToAlignment(inumbits, host_char_bit) / host_char_bit;
+  unsigned onumbytes =
+      RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
+  int bits = (0 - inumbits) % host_char_bit;
+  // copy over the input bytes
+  memcpy(pr, pa, inumbytes);
+  if (bits) {
+    // zero the remaining bits of the partial byte
+    ((unsigned char *)pr)[inumbytes - 1] =
+        ((unsigned char *)pa)[inumbytes - 1] << bits >> bits;
+  }
+  // zero-extend the rest of the bytes
+  memset((char *)pr + inumbytes, 0, onumbytes - inumbytes);
+}
+
+extern "C" JL_DLLEXPORT void LLVMTrunc(unsigned inumbits, integerPart *pa,
+                                       unsigned onumbits, integerPart *pr) {
+  assert(inumbits > onumbits);
+  unsigned onumbytes =
+      RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
+  memcpy(pr, pa, onumbytes);
+}
+
+extern "C" JL_DLLEXPORT unsigned countTrailingZeros_8(uint8_t Val) {
 #if JL_LLVM_VERSION >= 30500
-    return countTrailingZeros(Val);
+  return countTrailingZeros(Val);
 #else
-    return CountTrailingZeros_32(Val);
+  return CountTrailingZeros_32(Val);
 #endif
 }
 
-extern "C" JL_DLLEXPORT
-unsigned countTrailingZeros_16(uint16_t Val) {
+extern "C" JL_DLLEXPORT unsigned countTrailingZeros_16(uint16_t Val) {
 #if JL_LLVM_VERSION >= 30500
-    return countTrailingZeros(Val);
+  return countTrailingZeros(Val);
 #else
-    return CountTrailingZeros_32(Val);
+  return CountTrailingZeros_32(Val);
 #endif
 }
 
-extern "C" JL_DLLEXPORT
-unsigned countTrailingZeros_32(uint32_t Val) {
+extern "C" JL_DLLEXPORT unsigned countTrailingZeros_32(uint32_t Val) {
 #if JL_LLVM_VERSION >= 30500
-    return countTrailingZeros(Val);
+  return countTrailingZeros(Val);
 #else
-    return CountTrailingZeros_32(Val);
+  return CountTrailingZeros_32(Val);
 #endif
 }
 
-extern "C" JL_DLLEXPORT
-unsigned countTrailingZeros_64(uint64_t Val) {
+extern "C" JL_DLLEXPORT unsigned countTrailingZeros_64(uint64_t Val) {
 #if JL_LLVM_VERSION >= 30500
-    return countTrailingZeros(Val);
+  return countTrailingZeros(Val);
 #else
-    return CountTrailingZeros_64(Val);
+  return CountTrailingZeros_64(Val);
 #endif
 }
 
-extern "C" JL_DLLEXPORT
-void jl_LLVMSMod(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    CREATE(a)
-    CREATE(b)
-    APInt r = a.srem(b);
-    if (a.isNegative() != b.isNegative()) {
-        r = (b + r).srem(b);
-    }
-    ASSIGN(r, r)
+extern "C" JL_DLLEXPORT void jl_LLVMSMod(unsigned numbits, integerPart *pa,
+                                         integerPart *pb, integerPart *pr) {
+  CREATE(a)
+  CREATE(b)
+  APInt r = a.srem(b);
+  if (a.isNegative() != b.isNegative()) {
+    r = (b + r).srem(b);
+  }
+  ASSIGN(r, r)
 }
 
-extern "C" JL_DLLEXPORT
-void jl_LLVMFlipSign(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) {
-    unsigned numbytes = RoundUpToAlignment(numbits, host_char_bit) / host_char_bit;
-    int signbit = (numbits - 1) % host_char_bit;
-    int sign = ((unsigned char*)pa)[numbytes - 1] & (1 << signbit) ? -1 : 0;
-    if (sign)
-        LLVMNeg(numbits, pa, pr);
-    else
-        memcpy(pr, pa,  numbytes);
+extern "C" JL_DLLEXPORT void jl_LLVMFlipSign(unsigned numbits, integerPart *pa,
+                                             integerPart *pb, integerPart *pr) {
+  unsigned numbytes =
+      RoundUpToAlignment(numbits, host_char_bit) / host_char_bit;
+  int signbit = (numbits - 1) % host_char_bit;
+  int sign = ((unsigned char *)pa)[numbytes - 1] & (1 << signbit) ? -1 : 0;
+  if (sign)
+    LLVMNeg(numbits, pa, pr);
+  else
+    memcpy(pr, pa, numbytes);
 }
 
-extern "C" JL_DLLEXPORT
-unsigned LLVMCountPopulation(unsigned numbits, integerPart *pa) {
-    CREATE(a)
-    return a.countPopulation();
+extern "C" JL_DLLEXPORT unsigned LLVMCountPopulation(unsigned numbits,
+                                                     integerPart *pa) {
+  CREATE(a)
+  return a.countPopulation();
 }
 
-extern "C" JL_DLLEXPORT
-unsigned LLVMCountTrailingOnes(unsigned numbits, integerPart *pa) {
-    CREATE(a)
-    return a.countTrailingOnes();
+extern "C" JL_DLLEXPORT unsigned LLVMCountTrailingOnes(unsigned numbits,
+                                                       integerPart *pa) {
+  CREATE(a)
+  return a.countTrailingOnes();
 }
 
-extern "C" JL_DLLEXPORT
-unsigned LLVMCountTrailingZeros(unsigned numbits, integerPart *pa) {
-    CREATE(a)
-    return a.countTrailingZeros();
+extern "C" JL_DLLEXPORT unsigned LLVMCountTrailingZeros(unsigned numbits,
+                                                        integerPart *pa) {
+  CREATE(a)
+  return a.countTrailingZeros();
 }
 
-extern "C" JL_DLLEXPORT
-unsigned LLVMCountLeadingOnes(unsigned numbits, integerPart *pa) {
-    CREATE(a)
-    return a.countLeadingOnes();
+extern "C" JL_DLLEXPORT unsigned LLVMCountLeadingOnes(unsigned numbits,
+                                                      integerPart *pa) {
+  CREATE(a)
+  return a.countLeadingOnes();
 }
 
-extern "C" JL_DLLEXPORT
-unsigned LLVMCountLeadingZeros(unsigned numbits, integerPart *pa) {
-    CREATE(a)
-    return a.countLeadingZeros();
+extern "C" JL_DLLEXPORT unsigned LLVMCountLeadingZeros(unsigned numbits,
+                                                       integerPart *pa) {
+  CREATE(a)
+  return a.countLeadingZeros();
 }
\ No newline at end of file
diff --git a/hpvm/projects/llvm-cbe/test/APInt-C.h b/hpvm/projects/llvm-cbe/test/APInt-C.h
index 793bc123003f81308a28c9ff154b4f6fd77b9ab2..873227caf38926f24e7529d62c8110d5fa2d64ef 100644
--- a/hpvm/projects/llvm-cbe/test/APInt-C.h
+++ b/hpvm/projects/llvm-cbe/test/APInt-C.h
@@ -15,41 +15,70 @@ typedef void integerPart;
 #endif
 
 JL_DLLEXPORT void LLVMNeg(unsigned numbits, integerPart *pa, integerPart *pr);
-JL_DLLEXPORT void LLVMByteSwap(unsigned numbits, integerPart *pa, integerPart *pr);
-
-JL_DLLEXPORT void LLVMAdd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMSub(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMMul(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMSDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMUDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMSRem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMURem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-
-JL_DLLEXPORT void LLVMAnd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMOr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMXor(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMShl(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMLShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMAShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void LLVMFlipAllBits(unsigned numbits, integerPart *pa, integerPart *pr);
+JL_DLLEXPORT void LLVMByteSwap(unsigned numbits, integerPart *pa,
+                               integerPart *pr);
+
+JL_DLLEXPORT void LLVMAdd(unsigned numbits, integerPart *pa, integerPart *pb,
+                          integerPart *pr);
+JL_DLLEXPORT void LLVMSub(unsigned numbits, integerPart *pa, integerPart *pb,
+                          integerPart *pr);
+JL_DLLEXPORT void LLVMMul(unsigned numbits, integerPart *pa, integerPart *pb,
+                          integerPart *pr);
+JL_DLLEXPORT void LLVMSDiv(unsigned numbits, integerPart *pa, integerPart *pb,
+                           integerPart *pr);
+JL_DLLEXPORT void LLVMUDiv(unsigned numbits, integerPart *pa, integerPart *pb,
+                           integerPart *pr);
+JL_DLLEXPORT void LLVMSRem(unsigned numbits, integerPart *pa, integerPart *pb,
+                           integerPart *pr);
+JL_DLLEXPORT void LLVMURem(unsigned numbits, integerPart *pa, integerPart *pb,
+                           integerPart *pr);
+
+JL_DLLEXPORT void LLVMAnd(unsigned numbits, integerPart *pa, integerPart *pb,
+                          integerPart *pr);
+JL_DLLEXPORT void LLVMOr(unsigned numbits, integerPart *pa, integerPart *pb,
+                         integerPart *pr);
+JL_DLLEXPORT void LLVMXor(unsigned numbits, integerPart *pa, integerPart *pb,
+                          integerPart *pr);
+JL_DLLEXPORT void LLVMShl(unsigned numbits, integerPart *pa, integerPart *pb,
+                          integerPart *pr);
+JL_DLLEXPORT void LLVMLShr(unsigned numbits, integerPart *pa, integerPart *pb,
+                           integerPart *pr);
+JL_DLLEXPORT void LLVMAShr(unsigned numbits, integerPart *pa, integerPart *pb,
+                           integerPart *pr);
+JL_DLLEXPORT void LLVMFlipAllBits(unsigned numbits, integerPart *pa,
+                                  integerPart *pr);
 
 JL_DLLEXPORT int LLVMICmpEQ(unsigned numbits, integerPart *pa, integerPart *pr);
 JL_DLLEXPORT int LLVMICmpNE(unsigned numbits, integerPart *pa, integerPart *pb);
-JL_DLLEXPORT int LLVMICmpSLT(unsigned numbits, integerPart *pa, integerPart *pb);
-JL_DLLEXPORT int LLVMICmpULT(unsigned numbits, integerPart *pa, integerPart *pb);
-JL_DLLEXPORT int LLVMICmpSLE(unsigned numbits, integerPart *pa, integerPart *pb);
-JL_DLLEXPORT int LLVMICmpULE(unsigned numbits, integerPart *pa, integerPart *pb);
-
-JL_DLLEXPORT int LLVMAdd_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT int LLVMAdd_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT int LLVMSub_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT int LLVMSub_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT int LLVMMul_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT int LLVMMul_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT int LLVMDiv_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT int LLVMDiv_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT int LLVMRem_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT int LLVMRem_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+JL_DLLEXPORT int LLVMICmpSLT(unsigned numbits, integerPart *pa,
+                             integerPart *pb);
+JL_DLLEXPORT int LLVMICmpULT(unsigned numbits, integerPart *pa,
+                             integerPart *pb);
+JL_DLLEXPORT int LLVMICmpSLE(unsigned numbits, integerPart *pa,
+                             integerPart *pb);
+JL_DLLEXPORT int LLVMICmpULE(unsigned numbits, integerPart *pa,
+                             integerPart *pb);
+
+JL_DLLEXPORT int LLVMAdd_uov(unsigned numbits, integerPart *pa, integerPart *pb,
+                             integerPart *pr);
+JL_DLLEXPORT int LLVMAdd_sov(unsigned numbits, integerPart *pa, integerPart *pb,
+                             integerPart *pr);
+JL_DLLEXPORT int LLVMSub_uov(unsigned numbits, integerPart *pa, integerPart *pb,
+                             integerPart *pr);
+JL_DLLEXPORT int LLVMSub_sov(unsigned numbits, integerPart *pa, integerPart *pb,
+                             integerPart *pr);
+JL_DLLEXPORT int LLVMMul_sov(unsigned numbits, integerPart *pa, integerPart *pb,
+                             integerPart *pr);
+JL_DLLEXPORT int LLVMMul_uov(unsigned numbits, integerPart *pa, integerPart *pb,
+                             integerPart *pr);
+JL_DLLEXPORT int LLVMDiv_sov(unsigned numbits, integerPart *pa, integerPart *pb,
+                             integerPart *pr);
+JL_DLLEXPORT int LLVMDiv_uov(unsigned numbits, integerPart *pa, integerPart *pb,
+                             integerPart *pr);
+JL_DLLEXPORT int LLVMRem_sov(unsigned numbits, integerPart *pa, integerPart *pb,
+                             integerPart *pr);
+JL_DLLEXPORT int LLVMRem_uov(unsigned numbits, integerPart *pa, integerPart *pb,
+                             integerPart *pr);
 
 JL_DLLEXPORT unsigned LLVMCountPopulation(unsigned numbits, integerPart *pa);
 JL_DLLEXPORT unsigned LLVMCountTrailingOnes(unsigned numbits, integerPart *pa);
@@ -57,30 +86,40 @@ JL_DLLEXPORT unsigned LLVMCountTrailingZeros(unsigned numbits, integerPart *pa);
 JL_DLLEXPORT unsigned LLVMCountLeadingOnes(unsigned numbits, integerPart *pa);
 JL_DLLEXPORT unsigned LLVMCountLeadingZeros(unsigned numbits, integerPart *pa);
 
-JL_DLLEXPORT void LLVMFPtoSI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-JL_DLLEXPORT void LLVMFPtoUI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-JL_DLLEXPORT void LLVMSItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-JL_DLLEXPORT void LLVMUItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-JL_DLLEXPORT void LLVMSExt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-JL_DLLEXPORT void LLVMZExt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-JL_DLLEXPORT void LLVMTrunc(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-
-JL_DLLEXPORT int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-JL_DLLEXPORT int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-
-JL_DLLEXPORT void jl_LLVMSMod(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-JL_DLLEXPORT void jl_LLVMFlipSign(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+JL_DLLEXPORT void LLVMFPtoSI(unsigned numbits, integerPart *pa,
+                             unsigned onumbits, integerPart *pr);
+JL_DLLEXPORT void LLVMFPtoUI(unsigned numbits, integerPart *pa,
+                             unsigned onumbits, integerPart *pr);
+JL_DLLEXPORT void LLVMSItoFP(unsigned numbits, integerPart *pa,
+                             unsigned onumbits, integerPart *pr);
+JL_DLLEXPORT void LLVMUItoFP(unsigned numbits, integerPart *pa,
+                             unsigned onumbits, integerPart *pr);
+JL_DLLEXPORT void LLVMSExt(unsigned numbits, integerPart *pa, unsigned onumbits,
+                           integerPart *pr);
+JL_DLLEXPORT void LLVMZExt(unsigned numbits, integerPart *pa, unsigned onumbits,
+                           integerPart *pr);
+JL_DLLEXPORT void LLVMTrunc(unsigned numbits, integerPart *pa,
+                            unsigned onumbits, integerPart *pr);
+
+JL_DLLEXPORT int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa,
+                                  unsigned onumbits, integerPart *pr);
+JL_DLLEXPORT int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa,
+                                  unsigned onumbits, integerPart *pr);
+
+JL_DLLEXPORT void jl_LLVMSMod(unsigned numbits, integerPart *pa,
+                              integerPart *pb, integerPart *pr);
+JL_DLLEXPORT void jl_LLVMFlipSign(unsigned numbits, integerPart *pa,
+                                  integerPart *pb, integerPart *pr);
 
 JL_DLLEXPORT unsigned countTrailingZeros_8(uint8_t Val);
 JL_DLLEXPORT unsigned countTrailingZeros_16(uint16_t Val);
 JL_DLLEXPORT unsigned countTrailingZeros_32(uint32_t Val);
 JL_DLLEXPORT unsigned countTrailingZeros_64(uint64_t Val);
 
-//uint8_t getSwappedBytes_8(uint8_t Value); // no-op
-//uint16_t getSwappedBytes_16(uint16_t Value);
-//uint32_t getSwappedBytes_32(uint32_t Value);
-//uint64_t getSwappedBytes_64(uint64_t Value);
-
+// uint8_t getSwappedBytes_8(uint8_t Value); // no-op
+// uint16_t getSwappedBytes_16(uint16_t Value);
+// uint32_t getSwappedBytes_32(uint32_t Value);
+// uint64_t getSwappedBytes_64(uint64_t Value);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test001.c b/hpvm/projects/llvm-cbe/test/cfiles/test001.c
index 817d7ca8cae09d11e57848ee7d3fdb9a7931d19a..8606d141ba73ddce2a598e85c6a787d715b1a5e2 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test001.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test001.c
@@ -11,7 +11,4 @@
 //
 //===----------------------------------------------------------------------===//
 
-int main()
-{
-    return 6;
-}
+int main() { return 6; }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test002.c b/hpvm/projects/llvm-cbe/test/cfiles/test002.c
index 9af3c34ee82cf9517f0f4ed4015a239fdace5cfb..aeb02526f8b2bda1b0bae293d1f006c6a4622641 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test002.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test002.c
@@ -8,14 +8,13 @@
 //===----------------------------------------------------------------------===//
 //
 // This code tests to see that the CBE will execute a for loop correctly.
-// *TW 
+// *TW
 //
 //===----------------------------------------------------------------------===//
 
-int main()
-{
-    int i, x = 0;
-    for (i = 0; i < 6; i++)
-        ++x;
-    return x;
+int main() {
+  int i, x = 0;
+  for (i = 0; i < 6; i++)
+    ++x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test003.c b/hpvm/projects/llvm-cbe/test/cfiles/test003.c
index 4aa8eb6bfb6e4a4e4d67f5fd4f5847cc608d6cae..bfeaef5db7a85f23c746b90f17461fb10dfd87e8 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test003.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test003.c
@@ -11,13 +11,11 @@
 // *TW
 //===----------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 0, x = 0;
-    while (i < 6) {
-        ++x;
-        ++i;
-    }
-    return x;
-} 
-   
+int main() {
+  int i = 0, x = 0;
+  while (i < 6) {
+    ++x;
+    ++i;
+  }
+  return x;
+}
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test004.c b/hpvm/projects/llvm-cbe/test/cfiles/test004.c
index ba619f09bbaab461723a6f85dca1dfbb28ceac41..35a5a02d83091093a1b251bbb5a7158b11d93244 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test004.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test004.c
@@ -7,15 +7,15 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute an if/else statement correctly.
-// *TW
+// This code tests to see that the CBE will execute an if/else statement
+// correctly. *TW
 //===----------------------------------------------------------------------===//
 
 int main() {
-    int x = 3;
-    x += 3;
-    if (x == 6)
-        return x;
-    else
-        return 0;
+  int x = 3;
+  x += 3;
+  if (x == 6)
+    return x;
+  else
+    return 0;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test005.c b/hpvm/projects/llvm-cbe/test/cfiles/test005.c
index 8b9323a97e3a27cfb4cc45b17ba26b39c96a180c..a287f075cd3b152e84a0bd24ce35097c5bb231b7 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test005.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test005.c
@@ -12,10 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 int main() {
-    int i, j, x = 0;
-    for (i = 0; i < 3; i++)
-        for (j = 0; j < 2; j++)
-            ++x;
-            
-    return x;
+  int i, j, x = 0;
+  for (i = 0; i < 3; i++)
+    for (j = 0; j < 2; j++)
+      ++x;
+
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test006.c b/hpvm/projects/llvm-cbe/test/cfiles/test006.c
index b513d75d4ab163388f15f156d1387d5b71dfcdf4..fe901d6d19cd2dabd11f66623d1f1ca3d0cf55b9 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test006.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test006.c
@@ -7,18 +7,18 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a nested while loop correctly.
-// *TW
+// This code tests to see that the CBE will execute a nested while loop
+// correctly. *TW
 //===----------------------------------------------------------------------===//
 
 int main() {
-    int i = 0, j = 0, x = 0;
-    while (i < 6) {
-        while (j < 6) {
-            ++x;
-            ++j;
-        }
-      ++i;
+  int i = 0, j = 0, x = 0;
+  while (i < 6) {
+    while (j < 6) {
+      ++x;
+      ++j;
     }
-    return x;
+    ++i;
+  }
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test007.c b/hpvm/projects/llvm-cbe/test/cfiles/test007.c
index 50c895d18192844c38c53c6e706eb2c4f163713d..b4ff4365db7ad0f48dc9fa2171a818757ba899c1 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test007.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test007.c
@@ -7,27 +7,27 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a switch statement correctly.
-// *TW
+// This code tests to see that the CBE will execute a switch statement
+// correctly. *TW
 //===----------------------------------------------------------------------===//
 
 int main() {
-   char var = 'x';
-   
-   switch (var) {
-      case 'z' :
-         return 0;
-         break;
-      case 'y' :
-         return 1;
-         break;
-      case 'x' :
-         return 6;
-         break;
-      case 'w' :
-         return 7;
-         break;
-      default :
-         return 100;
-   }
+  char var = 'x';
+
+  switch (var) {
+  case 'z':
+    return 0;
+    break;
+  case 'y':
+    return 1;
+    break;
+  case 'x':
+    return 6;
+    break;
+  case 'w':
+    return 7;
+    break;
+  default:
+    return 100;
+  }
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test008.c b/hpvm/projects/llvm-cbe/test/cfiles/test008.c
index 283b8f73bafe45c6225e5270249c458b9a75a80d..f054263e0b5490d25b16c53c082d7b0dfbd1793f 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test008.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test008.c
@@ -12,18 +12,18 @@
 //===----------------------------------------------------------------------===//
 
 struct test {
-   int var1;
-   int var2;
-   int var3;
+  int var1;
+  int var2;
+  int var3;
 };
 
 int main() {
 
-   struct test variable;
+  struct test variable;
 
-   variable.var2 = 5;
-   variable.var3 = 6;
-   variable.var1 = 9;
-    
-   return variable.var3;
+  variable.var2 = 5;
+  variable.var3 = 6;
+  variable.var1 = 9;
+
+  return variable.var3;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test009.c b/hpvm/projects/llvm-cbe/test/cfiles/test009.c
index a46509105cb73430794e55eb9d5af6d0da98ff6f..1b2fc327e2c7fd67ba1520dbecebd4803507c600 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test009.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test009.c
@@ -12,10 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 int main() {
-    int example[10];
-    int i;
-       for (i = 0;i < 10; ++i) {
-        example[i] = i;
-       }
-       return example[6];
+  int example[10];
+  int i;
+  for (i = 0; i < 10; ++i) {
+    example[i] = i;
+  }
+  return example[6];
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test010.c b/hpvm/projects/llvm-cbe/test/cfiles/test010.c
index e3841e64d3e41aa923201427f6913c3e30a650c9..21c6fdd0c7b6ed0a6c346d01a8e8836a4b2050a5 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test010.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test010.c
@@ -7,37 +7,37 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a nested switch statement correctly.
-// *TW
+// This code tests to see that the CBE will execute a nested switch statement
+// correctly. *TW
 //===----------------------------------------------------------------------===//
 
 int main() {
-   char var = 'x', var2;
-   switch (var) {
-      case 'z' :
-         return 0;
-         break;
-      case 'y' :
-         return 1;
-         break;
-      case 'x' :
-         var2 = 'b';
-    
-         switch (var2) {
-            case 'a' :
-               return 10;
-               break;
-            case 'b' :
-               return 6;
-               break;
-            default :
-               return 18;
-         }
+  char var = 'x', var2;
+  switch (var) {
+  case 'z':
+    return 0;
+    break;
+  case 'y':
+    return 1;
+    break;
+  case 'x':
+    var2 = 'b';
 
-         case 'w' :
-            return 7;
-            break;
-         default :
-            return 100;
-   }
+    switch (var2) {
+    case 'a':
+      return 10;
+      break;
+    case 'b':
+      return 6;
+      break;
+    default:
+      return 18;
+    }
+
+  case 'w':
+    return 7;
+    break;
+  default:
+    return 100;
+  }
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test011.c b/hpvm/projects/llvm-cbe/test/cfiles/test011.c
index aa0ee7229f512c25e2794372eac697c85d35b531..9ff808b7096c728794ed472349b472d5ce61b952 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test011.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test011.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +8,13 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle addition between two variables.
-// *TW
+// This code tests to see that the CBE can handle addition between two
+// variables. *TW
 //===------------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 2, t = 4, x = 0;
-    x = i+t;
+int main() {
+  int i = 2, t = 4, x = 0;
+  x = i + t;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test012.c b/hpvm/projects/llvm-cbe/test/cfiles/test012.c
index 403c635686a51eb493c0ca224c043b6aa6c2fce6..60689156c5bcd5c835aebb0a9c5e0e8d7612d164 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test012.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test012.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//--------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +8,14 @@
 //
 //===----------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle subtraction between two variables.
+// This code tests to see that the CBE can handle subtraction between two
+// variables.
 //  *TW
 //===----------------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 8, t = 2, x = 0;
-    x = i-t;
+int main() {
+  int i = 8, t = 2, x = 0;
+  x = i - t;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test013.c b/hpvm/projects/llvm-cbe/test/cfiles/test013.c
index 444d4676b78a2f5324cb9bbccfac67bfcf9330aa..9bb5dc492bc251f11c152eb2ea7b506c3354430c 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test013.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test013.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +8,13 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle multiplication between two variables.
-// *TW
+// This code tests to see that the CBE can handle multiplication between two
+// variables. *TW
 //===------------------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 3, t = 2, x = 0;
-    x = i*t;
+int main() {
+  int i = 3, t = 2, x = 0;
+  x = i * t;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test014.c b/hpvm/projects/llvm-cbe/test/cfiles/test014.c
index e1dc6931f9e989ed867f8abf93dfa5f57042de5c..cbc0ad52d407bfc768a56b7105d4b93a7d2bdaf7 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test014.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test014.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +8,13 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle division between two variables.
-// *TW
+// This code tests to see that the CBE can handle division between two
+// variables. *TW
 //===------------------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 30, t = 5, x = 0;
-    x = i/t;
+int main() {
+  int i = 30, t = 5, x = 0;
+  x = i / t;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test015.c b/hpvm/projects/llvm-cbe/test/cfiles/test015.c
index e4c2a5c03b28ca481dd3709e18567237cc12a660..81c2f22808e4f4efcb7a4d031faf6a7e2e197f37 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test015.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test015.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,10 +12,9 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 26, t = 20, x = 0;
-    x = i%t;
+int main() {
+  int i = 26, t = 20, x = 0;
+  x = i % t;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test016.c b/hpvm/projects/llvm-cbe/test/cfiles/test016.c
index 0841840ebc31ba622a4538f328b658b0bf52e08c..bb5bc64fff2b798375e2c2470e6538d5009c7719 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test016.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test016.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,12 +13,12 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    char ch;
+  char ch;
 
-    if(sizeof(+ch) == 4) {
-        return 6;
-    }
-    return 1;
+  if (sizeof(+ch) == 4) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test017.c b/hpvm/projects/llvm-cbe/test/cfiles/test017.c
index 0535862b3057ea1cf7ac7ba2801a563a85d75dbe..a87abcd1e8f3311be495deb7bcf369f01ceeaa7f 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test017.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test017.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,11 +14,11 @@
 
 int main() {
 
-    signed int a = 10;
-    signed int b = -a;
+  signed int a = 10;
+  signed int b = -a;
 
-    if(b == -10) {
-        return 6;
-    }
-        return 1;
+  if (b == -10) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test018.c b/hpvm/projects/llvm-cbe/test/cfiles/test018.c
index c02efa9d0e914b96a6d491769e1a22e2e2747047..ea38b291393f20192f1885bdd702ef321b6929f0 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test018.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test018.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,15 +8,15 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle the incremental (++a) operator.
-// *TW
+// This code tests to see that the CBE can handle the incremental (++a)
+// operator. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 5;
+  int x = 5;
 
-    ++x;
+  ++x;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test019.c b/hpvm/projects/llvm-cbe/test/cfiles/test019.c
index 1975bb9c5b3e0eaae7a1417310da435f3df8a0d6..484fe0481656cba546bee1565e110f1a0dc90327 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test019.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test019.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,15 +8,15 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle the decremental (--a) operator.
-// *TW
+// This code tests to see that the CBE can handle the decremental (--a)
+// operator. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 7;
-   
-    --x;
+  int x = 7;
 
-    return x;
+  --x;
+
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test020.c b/hpvm/projects/llvm-cbe/test/cfiles/test020.c
index a68801708d9b628dae8dc3b5dba130f86436bdb6..98ed7f1701cdfdf442e6253706f2bc2f1f30227f 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test020.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test020.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 3;
+  int x = 6;
+  int y = 3;
 
-    if(x > y){
-        return x;
-    }
-    return 1;
+  if (x > y) {
+    return x;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test021.c b/hpvm/projects/llvm-cbe/test/cfiles/test021.c
index 93eed31d9bbfc11ad0559dcaf649df9e1f9206c1..0c5e63a462482f6b2f5cc392c2508a3076c3c3e6 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test021.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test021.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 6;
+  int x = 6;
+  int y = 6;
 
-    if(x >= y){
-        return x;
-    }
-    return 1;
+  if (x >= y) {
+    return x;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test022.c b/hpvm/projects/llvm-cbe/test/cfiles/test022.c
index 895069a83bc7b1c1df9d6f27784187940b493b35..1578e158914dd68f5b99c4c69b36e19f23217939 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test022.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test022.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 12;
+  int x = 6;
+  int y = 12;
 
-
-    if(x < y){
-        return x;
-    }
-    return 1;
+  if (x < y) {
+    return x;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test023.c b/hpvm/projects/llvm-cbe/test/cfiles/test023.c
index 52348d3e1690624aa712ec6735e811f4ab958055..bc309ddb015a9af75cfcd3f09a7c5f5e093ba981 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test023.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test023.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 6;
+  int x = 6;
+  int y = 6;
 
-    if(x <= y){
-        return x;
-    }
-    return 1;
+  if (x <= y) {
+    return x;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test024.c b/hpvm/projects/llvm-cbe/test/cfiles/test024.c
index 2c90879b87e3646db441c47e114b04f05de134a8..782d41a47880e3078af3d3775923870efba0a915 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test024.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test024.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 6;
+  int x = 6;
+  int y = 6;
 
-    if(x == y){
-        return x;
-    }
-    return 1;
+  if (x == y) {
+    return x;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test025.c b/hpvm/projects/llvm-cbe/test/cfiles/test025.c
index 153cb4013477a27ad95248d38c62aa45bb2d5206..26bedf78ca25c8da2a1ba9c12694ddbdba087033 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test025.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test025.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 2;
+  int x = 6;
+  int y = 2;
 
-    if(x != y){
-        return x;
-    }
-    return 1;
+  if (x != y) {
+    return x;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test026.c b/hpvm/projects/llvm-cbe/test/cfiles/test026.c
index 874c06957d200d2402974e3928aae339f5c2d16c..cf0b3e6ae94f24c8392a7b6f91a7dade1c1a6613 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test026.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test026.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-int x = 6;
-int y = 6;
-int z = 6;
+  int x = 6;
+  int y = 6;
+  int z = 6;
 
-    if(x == y && x == z){
-        return 6;
-    }
-    return 1;
+  if (x == y && x == z) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test027.c b/hpvm/projects/llvm-cbe/test/cfiles/test027.c
index d1322597c34d5d3d284ae8d4de203b4bc769f998..f1e0adb31dc38ed4d35a25590281e3a9ac505474 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test027.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test027.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 6;
-    int z = 6;
+  int x = 6;
+  int y = 6;
+  int z = 6;
 
-    if(x == y || x != z){
-        return 6;
-    }
-    return 1;
+  if (x == y || x != z) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test028.c b/hpvm/projects/llvm-cbe/test/cfiles/test028.c
index ce77d792f3b2e75d3795784a3a932c37c120c764..7e2ecdcf3f66c4637a15b2d2d19ddd3b5e740469 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test028.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test028.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = -7;
-    unsigned int b = 0;
+  unsigned int a = -7;
+  unsigned int b = 0;
 
-    b = ~a;
-    if( b == 6){
-        return 6;
-    }
-    return 1;
+  b = ~a;
+  if (b == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test029.c b/hpvm/projects/llvm-cbe/test/cfiles/test029.c
index b7ac93ecf5f275ba5129a9bed6988b694cc0ca39..34d1ff5c8be474ccc629e9d10c51ca30b5cb8c10 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test029.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test029.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,16 +13,15 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 6;  //0110
-    unsigned int b = 15; //1111
-    unsigned int c = 0;
+  unsigned int a = 6;  // 0110
+  unsigned int b = 15; // 1111
+  unsigned int c = 0;
 
-    c = a&b;
-    if(c == 6){
-        return 6;
-    }
-    return 1;
+  c = a & b;
+  if (c == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test030.c b/hpvm/projects/llvm-cbe/test/cfiles/test030.c
index 333ce5aa01915623200c600249ebb2377782a139..a88c910f8f25d85785ed5c5f03578157fc13b47d 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test030.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test030.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,16 +13,15 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-unsigned int a = 2;
-unsigned int b = 4;
-unsigned int c = 0;
+  unsigned int a = 2;
+  unsigned int b = 4;
+  unsigned int c = 0;
 
-    c = a|b;
-    if(c == 6){
-        return 6;
-    }
-    return 1;
+  c = a | b;
+  if (c == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test031.c b/hpvm/projects/llvm-cbe/test/cfiles/test031.c
index 69d0dab0e1ff78ff13fce90e3ae68de355b7cf19..6e13a9f03fae2a6b47e6fa6000e7fcea7a74b8d1 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test031.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test031.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,16 +13,15 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 9;  //1001
-    unsigned int b = 15; //1111
-    unsigned int c = 0;
+  unsigned int a = 9;  // 1001
+  unsigned int b = 15; // 1111
+  unsigned int c = 0;
 
-
-    c = a^b;
-    if(c == 6){
-        return 6;
-    }
-    return 1;
+  c = a ^ b;
+  if (c == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test032.c b/hpvm/projects/llvm-cbe/test/cfiles/test032.c
index ae63e2c4d26864d0bec5dc89dc6a80174e89c985..a98ab650e98bd0e147825bca50dca1bfcaea5809 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test032.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test032.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,16 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 3;  //0011
-    unsigned int b = 0;
+  unsigned int a = 3; // 0011
+  unsigned int b = 0;
 
-
-    b = a << 1;  //0110
-    if(b == 6){
-          return 6;
-    }
-    return 1;
+  b = a << 1; // 0110
+  if (b == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test033.c b/hpvm/projects/llvm-cbe/test/cfiles/test033.c
index 1bb96d21bdef67392305cf8631aa7243ab77cb98..81b4177184b79eb5e282cafea5b00c60ce48a5a4 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test033.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test033.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 13;  //1100
-    unsigned int b = 0;
+  unsigned int a = 13; // 1100
+  unsigned int b = 0;
 
-    b = a >> 1;  //0110
-    if(b == 6){
-          return 6;
-    }
-    return 1;
+  b = a >> 1; // 0110
+  if (b == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test034.c b/hpvm/projects/llvm-cbe/test/cfiles/test034.c
index dd9106b0be38c88c2a9a10a5c937c2c373ed5eed..977bf40358d94bb0bb10a66938b4f21b500aab57 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test034.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test034.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int a = 3;
-    int b = 3;
- 
-    a+=b;
-    if(a == 6){
-          return 6;
-    }
-    return 1;
+  int a = 3;
+  int b = 3;
+
+  a += b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test035.c b/hpvm/projects/llvm-cbe/test/cfiles/test035.c
index d1c0ae391f15a0a8b3a118b2435667329ca86a85..8a7f23e17b1bc6f7569cc3352d6898ea033e40a5 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test035.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test035.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int a = 9;
-    int b = 3;
+  int a = 9;
+  int b = 3;
 
-    a-=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a -= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test036.c b/hpvm/projects/llvm-cbe/test/cfiles/test036.c
index d8d5a1957c84d7afa5cc9d96a6afd64297172f21..019722660fb4641ddfbe13d3d5fcec7aa06102c6 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test036.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test036.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,15 +12,14 @@
 // Compound Multiplication Assignment(a*=b) operator.
 // *TW
 //===------------------------------------------------------------------------===//
-int main(){
+int main() {
 
-    int a = 2;
-    int b = 3;
+  int a = 2;
+  int b = 3;
 
-    a*=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a *= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test037.c b/hpvm/projects/llvm-cbe/test/cfiles/test037.c
index 5bf5ee705a44ec929a70d0176690733bcc1fbcb8..2363c91ce91768e7dee329eec5db68beedf8076f 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test037.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test037.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int a = 30;
-    int b = 5;
+  int a = 30;
+  int b = 5;
 
-    a/=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a /= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test038.c b/hpvm/projects/llvm-cbe/test/cfiles/test038.c
index efbe23460710f9f7108b18ba5e60ace5168338a6..1d6aa395aac2994f6bca35de131941154650038b 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test038.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test038.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int a = 20;
-    int b = 14;
+  int a = 20;
+  int b = 14;
 
-    a%=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a %= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test039.c b/hpvm/projects/llvm-cbe/test/cfiles/test039.c
index 112d7f69700f6d7bfad21bc58d91506f4d95b68e..53d4fcb9133bdf57967034a7ad02b231088c59ad 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test039.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test039.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 6;  //0110
-    unsigned int b = 15; //1111
+  unsigned int a = 6;  // 0110
+  unsigned int b = 15; // 1111
 
-    a&=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a &= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test040.c b/hpvm/projects/llvm-cbe/test/cfiles/test040.c
index 5285fb73ecf5d3572c9cbf279be94debc6043e85..d174e7e88041eaca97dc2acb13d218c7ea8baba9 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test040.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test040.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 2;
-    unsigned int b = 4;
+  unsigned int a = 2;
+  unsigned int b = 4;
 
-    a|=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a |= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test041.c b/hpvm/projects/llvm-cbe/test/cfiles/test041.c
index f04e682356dff0739053b0e91857805b952c8aec..45f64966d499f07f09a6b3904a8c2cd9fc9c71a0 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test041.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test041.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 9;  //1001
-    unsigned int b = 15; //1111
+  unsigned int a = 9;  // 1001
+  unsigned int b = 15; // 1111
 
-    a^=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a ^= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test042.c b/hpvm/projects/llvm-cbe/test/cfiles/test042.c
index 5b4f12d80882f347efbe1ae59103f7eaf672c464..ec2547370b90902fffb0e61564bb02c782e20fd9 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test042.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test042.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 3;  //0011
+  unsigned int a = 3; // 0011
 
-    a <<= 1;  //0110
-    if( a == 6){
-        return 6;
-    }
-    return 1;
+  a <<= 1; // 0110
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test043.c b/hpvm/projects/llvm-cbe/test/cfiles/test043.c
index 3b42179304a1741c268d8adf90192bafc5a2ba98..6aeb7bd17c9f40b6f86cdf1fd8ea2dbe520ce554 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test043.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test043.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 13;  //1100
+  unsigned int a = 13; // 1101
 
-    a >>= 1;  //0110
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a >>= 1; // 0110
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test044.c b/hpvm/projects/llvm-cbe/test/cfiles/test044.c
index dbb9d31ad940421d1d33d21af916eccd672ac2a8..f9b7c2d4632326b81ca526a82765152039269fba 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test044.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test044.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,17 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a char.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// char. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    char a = 'A' ; //65
-    int ia = 0;
+  char a = 'A'; // 65
+  int ia = 0;
 
-    ia = a;
-    ia-=59;
+  ia = a;
+  ia -= 59;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test045.c b/hpvm/projects/llvm-cbe/test/cfiles/test045.c
index 50aaa8effcd3994d1dd47213d25748b1293f49f0..c8b57993a7edcbc691c641b1405f6f3ae137b65e 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test045.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test045.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed char a = 'A';
-    int ia = 0;
+  signed char a = 'A';
+  int ia = 0;
 
-    ia = a;
-    ia-=59;
+  ia = a;
+  ia -= 59;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test046.c b/hpvm/projects/llvm-cbe/test/cfiles/test046.c
index ea57085caf034bdaf554169c439d21cabdfc1606..edbfe837fe615cea8ce58d5e7732da49632cf66c 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test046.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test046.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned char a = 'A';
-    int ia = 0;
+  unsigned char a = 'A';
+  int ia = 0;
 
-    ia = a;
-    ia-=59;
+  ia = a;
+  ia -= 59;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test047.c b/hpvm/projects/llvm-cbe/test/cfiles/test047.c
index 2b90d14c7f9b195cdb94611925644cd4debb99ea..476cea234f53c18c684c9069c0675fc2effe48d3 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test047.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test047.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,13 +8,12 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning an int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// int. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-	int a = 6;
-        return a;
+  int a = 6;
+  return a;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test048.c b/hpvm/projects/llvm-cbe/test/cfiles/test048.c
index c30694ff502502de722f99e2f8a21cfe79ddf17c..ee3966afccc9d7e3f0aaff62aec16142e28a601e 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test048.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test048.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,11 +13,11 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    short int a = 6;
-    int ia = 0;
-    ia = (int)a;
+  short int a = 6;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test049.c b/hpvm/projects/llvm-cbe/test/cfiles/test049.c
index bb4a0801981e734a36518cf406fc8edd2213d0cd..5f29feffc05704adf39078d49177ab5edb5cffcf 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test049.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test049.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,11 +13,11 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    long int a = 6;
-    int ia = 0;
-    ia = (int)a;
+  long int a = 6;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test050.c b/hpvm/projects/llvm-cbe/test/cfiles/test050.c
index f69c7cee23cbc47535ce653f28a26305195541e4..aa49757a320855970290cdef405a495052405ffe 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test050.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test050.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,11 +13,11 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed int a = 6;
-    int ia = 0;
-    ia = (int)a;
+  signed int a = 6;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test051.c b/hpvm/projects/llvm-cbe/test/cfiles/test051.c
index 61f1e03d57d03c6a298ed50880d074e4f58e9e9a..0334eafdf30b2be0c6cde7dfeadaaf074d943608 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test051.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test051.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,12 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 6;
+  unsigned int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test052.c b/hpvm/projects/llvm-cbe/test/cfiles/test052.c
index 48e1ce67f8edf77d03d1a9c75d72352b26f06511..3230b192b7b70080bb12318c983b86dcc4b3159b 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test052.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test052.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a float.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// float. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    float a = 6.0;
+  float a = 6.0;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test053.c b/hpvm/projects/llvm-cbe/test/cfiles/test053.c
index 86dd5691a77f96fcd6e8568d22e93ef3a160872b..4ea19186428065a3813addb8537d5331c2709015 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test053.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test053.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a double.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// double. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    double a = 6.0;
+  double a = 6.0;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test054.c b/hpvm/projects/llvm-cbe/test/cfiles/test054.c
index 4c86601412f5db9c6ec818f6029374dacaeebb60..caa7d00080554531f087c950da49975854b2d4aa 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test054.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test054.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a long double.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a long
+// double. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    long double a = 6.0;
+  long double a = 6.0;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test055.c b/hpvm/projects/llvm-cbe/test/cfiles/test055.c
index cd7891acfe29906d1c0f6b6e9462ba2a95d8e747..4b85082d3353dea43862a1a0db736f1c43bf91fd 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test055.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test055.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a short.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// short. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    short a = 6;
+  short a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test056.c b/hpvm/projects/llvm-cbe/test/cfiles/test056.c
index b12df1df990921aa6586cb6c4328733098c89386..305f044be1a342cd921f5bdc9ab22d938e172103 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test056.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test056.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a signed short.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// signed short. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed short a = 6;
+  signed short a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test057.c b/hpvm/projects/llvm-cbe/test/cfiles/test057.c
index 50678081ec9ba22cc6274de0c6be137f913704bb..280ec876bda3a5ffb004d0eb14afa47250253c7d 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test057.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test057.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning an unsigned short.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// unsigned short. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned short a = 6;
+  unsigned short a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test058.c b/hpvm/projects/llvm-cbe/test/cfiles/test058.c
index cdbfac068fe5aa9639691dcf869d016213e7dff7..f5404bd8336012b80e9f3d02074b4bd390924a84 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test058.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test058.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a signed short int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// signed short int. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed short int a = 6;
+  signed short int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test059.c b/hpvm/projects/llvm-cbe/test/cfiles/test059.c
index 4de964a13ec97e47ff09e618ac6e9c232d9acf35..13b3ac08797e64625e89a9404cd35a0c27d21203 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test059.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test059.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a unsigned short int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// unsigned short int. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned short int a = 6;
+  unsigned short int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test060.c b/hpvm/projects/llvm-cbe/test/cfiles/test060.c
index a0a6e16949f5730787137cbdf0bf5284ae6d292a..ecb393f2f368e5137d164a2996837db204c2f9f4 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test060.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test060.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    long a = 6;
+  long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test061.c b/hpvm/projects/llvm-cbe/test/cfiles/test061.c
index d1bf812aa0c3312a6b0dfafd2e59866ec5bdc236..ac7cadd45fe6e5148c41f38dee679ee8bddad2e3 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test061.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test061.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a signed long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// signed long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed long a = 6;
+  signed long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test062.c b/hpvm/projects/llvm-cbe/test/cfiles/test062.c
index 077ace8b321d7c9bd6a865bf0b2adb9bf892a3be..eaaf59853f711197b7049a993c48b939cbfab608 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test062.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test062.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a unsigned long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// unsigned long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned long a = 6;
+  unsigned long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test063.c b/hpvm/projects/llvm-cbe/test/cfiles/test063.c
index 78fbe390f5e05fbbf35f149bf6dc3f56ecd69549..fa6cd18e88bef646c55391a68564355302e55775 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test063.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test063.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a signed long int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// signed long int. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed long int a = 6;
+  signed long int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test064.c b/hpvm/projects/llvm-cbe/test/cfiles/test064.c
index c26a3da001557d18686ca20ec4de52bdd8e5e765..05a72b4b9a937ed87262c6ad40a9a24238b201dd 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test064.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test064.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,12 +13,12 @@
 // *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned long int a = 6;
+  unsigned long int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test065.c b/hpvm/projects/llvm-cbe/test/cfiles/test065.c
index d9b299752c54e9238c6e2171342bb6f1c470163b..76958db4c2fe457f52cccac60f8dd3f49f8a868d 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test065.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test065.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a long long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a long
+// long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    long long a = 6;
+  long long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test066.c b/hpvm/projects/llvm-cbe/test/cfiles/test066.c
index b4adc62240751fac572c9c1cede33279f51c7c90..10ec61f56ec72432bc43d8ae8af85226cd3f08e8 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test066.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test066.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a long long int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a long
+// long int. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    long long int a = 6;
+  long long int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test067.c b/hpvm/projects/llvm-cbe/test/cfiles/test067.c
index 9d786b521063454dc42ba609fc742091ba3df1bb..e90cc8caea23b2baec248752eb84fe3c9afd3479 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test067.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test067.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a signed long long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// signed long long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed long long  a = 6;
+  signed long long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test068.c b/hpvm/projects/llvm-cbe/test/cfiles/test068.c
index 1f72ecd1b7fa39c845c159ad3b9d86ce44d72547..5c0daa8a157d2ada06f4a1a4f2c66e1f3ac35354 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test068.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test068.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a unsigned long long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// unsigned long long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned long long  a = 6;
+  unsigned long long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test069.c b/hpvm/projects/llvm-cbe/test/cfiles/test069.c
index bc611f13c1552f42c38cbf054134fe8fc6f37e24..6cae210ec65e8c1fe9c3827e49ed1147cfe42d22 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test069.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test069.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,12 +13,12 @@
 // *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed long long int a = 6;
+  signed long long int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test070.c b/hpvm/projects/llvm-cbe/test/cfiles/test070.c
index 94c42bd8b5b4afee99d0cf18bec1806ceead963e..e9b55e232f54c9ac5ba6eea1e10a3babb88fb791 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test070.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test070.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning an unsigned long long int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// unsigned long long int. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned long long int a = 6;
+  unsigned long long int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test071.c b/hpvm/projects/llvm-cbe/test/cfiles/test071.c
index 3e090147c7e09ed0ce208e305659083a17a31f81..357bc1e53330345808b5cf966bd9bbd4827f89af 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test071.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test071.c
@@ -8,13 +8,13 @@
 //===----------------------------------------------------------------------===//
 //
 // This code tests to see that the CBE will execute an if statement correctly.
-// *TW 
+// *TW
 //
 //===----------------------------------------------------------------------===//
 
 int main() {
-   int x = 6;
-   if (x == 6)
-      return x;
-	return 0;
+  int x = 6;
+  if (x == 6)
+    return x;
+  return 0;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test072.c b/hpvm/projects/llvm-cbe/test/cfiles/test072.c
index 7c7cbcb391bb51a53e20bfae8aabb23cf0bc2ef6..87cbd91bb0591e452c9ada49c94968ab63c84c64 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test072.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test072.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//--------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,19 +8,18 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute an else-if statement correctly.
-// *TW
+// This code tests to see that the CBE will execute an else-if statement
+// correctly. *TW
 //
 //===---------------------------------------------------------------------------===//
 
 int main() {
-   int x = 6;
-   if (x == 4) {
-      return 2;
-   } else if (x == 6){
-        return 6;
-     } else {
-          return 8;
-     }
+  int x = 6;
+  if (x == 4) {
+    return 2;
+  } else if (x == 6) {
+    return 6;
+  } else {
+    return 8;
+  }
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test073.c b/hpvm/projects/llvm-cbe/test/cfiles/test073.c
index 006a7348e87c6259a41227f731542cdfe1f931d2..2e664c4c73bfe827019ff0cc3aae9e9f4037155d 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test073.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test073.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//--------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a do-while statement correctly.
-// *TW
+// This code tests to see that the CBE will execute a do-while statement
+// correctly. *TW
 //
 //===---------------------------------------------------------------------------===//
 
 int main() {
-   int x = 0;
-   do {
-      x++;
-   } while (x < 6);
+  int x = 0;
+  do {
+    x++;
+  } while (x < 6);
 
-   return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test074.c b/hpvm/projects/llvm-cbe/test/cfiles/test074.c
index bb3ff37858bdc25554481a48810c597f7b2f176e..903af81861c08822d9b9c9078bd0cd14d977f612 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test074.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test074.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//--------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,19 +8,18 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a break/continue statement correctly.
-// *TW
+// This code tests to see that the CBE will execute a break/continue statement
+// correctly. *TW
 //
 //===---------------------------------------------------------------------------===//
 
 int main() {
-   int x;
-   for (x=0; x<=25; x++) {
-      if (x == 6)
-         break;
-      if (x < 15)
-         continue;
-   }
-   return x;
+  int x;
+  for (x = 0; x <= 25; x++) {
+    if (x == 6)
+      break;
+    if (x < 15)
+      continue;
+  }
+  return x;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test075.c b/hpvm/projects/llvm-cbe/test/cfiles/test075.c
index a0601622c2f897e513615bcbd9b0a91176a26a5b..55562b99efb1414e10e139188919ee1c03e9133e 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test075.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test075.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//--------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,22 +8,21 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a Goto-Label statement correctly.
-// *TW
+// This code tests to see that the CBE will execute a Goto-Label statement
+// correctly. *TW
 //
 //===---------------------------------------------------------------------------===//
 
 int main() {
-   int x = 0;
-   goto label;
-   
-   for(;;) {
-      x = 10;
-      return x;
-   }
+  int x = 0;
+  goto label;
 
-   label:
-   x = 6;
-   return x;
+  for (;;) {
+    x = 10;
+    return x;
+  }
 
+label:
+  x = 6;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test076.c b/hpvm/projects/llvm-cbe/test/cfiles/test076.c
index d5f149eb3b51471ce23d8b9baa3186d58e509b44..faf56a3e37e14594fbfbfc3894e504989fcfdea1 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test076.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test076.c
@@ -15,8 +15,8 @@
 
 int main() {
 
-   int x = 6, y = 0, *ip = 0;
-   ip = &x;
-   y = *ip;
-   return y;
+  int x = 6, y = 0, *ip = 0;
+  ip = &x;
+  y = *ip;
+  return y;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test077.c b/hpvm/projects/llvm-cbe/test/cfiles/test077.c
index a6e1fc7985b1bb5c9d1ac6943533d5c523a7b18d..771463d5afe5dafd1b3975697d807ab3767b3915 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test077.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test077.c
@@ -14,11 +14,11 @@
 //===----------------------------------------------------------------------===//
 
 int main() {
-   char x = 'a', y = 'b', *cp;
-   cp = &x;
-   y = *cp;
-   if (y == 'a'){
-      return 6;
-   }
-   return 1;
+  char x = 'a', y = 'b', *cp;
+  cp = &x;
+  y = *cp;
+  if (y == 'a') {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test078.c b/hpvm/projects/llvm-cbe/test/cfiles/test078.c
index cc60c18e34b3ecf4a18401fe604d1b45d1f8d1b1..f511a93fd34e3f62ea2d9a515cc6dd70449dc0b3 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test078.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test078.c
@@ -15,9 +15,9 @@
 #include <stddef.h>
 
 int main() {
-   int *ptr = NULL;
-    if (ptr == 0){
-        return 6;
-    }
-    return 1;
+  int *ptr = NULL;
+  if (ptr == 0) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test079.c b/hpvm/projects/llvm-cbe/test/cfiles/test079.c
index fd1ea110398c435cb9276fd064f6190aba0b5470..12b3477e32ae06f3be29a89b0fc18bac338a0b57 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test079.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test079.c
@@ -14,12 +14,11 @@
 //===----------------------------------------------------------------------===//
 
 int main() {
-   double x = 6, y = 0, *dp;
-   dp = &x;
-   y = *dp;
-   if (y == 6){
-      return 6;
-   }
-   return 1;
+  double x = 6, y = 0, *dp;
+  dp = &x;
+  y = *dp;
+  if (y == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test080.c b/hpvm/projects/llvm-cbe/test/cfiles/test080.c
index b7fb855bf45dc87a03bd9ca24a785516250ead4b..9b42fab5d93a392064b8a22bf97d325cd0ae24cc 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test080.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test080.c
@@ -14,11 +14,11 @@
 //===----------------------------------------------------------------------===//
 
 int main() {
-   float x = 6, y = 0, *fp;
-   fp = &x;
-   y = *fp;
-   if (y == 6){
-      return 6;
-   }
-   return 1;
+  float x = 6, y = 0, *fp;
+  fp = &x;
+  y = *fp;
+  if (y == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test081.c b/hpvm/projects/llvm-cbe/test/cfiles/test081.c
index 6efcad46eadbb23dcc85b7f5aefa43855383bbe9..e032f57f519165cb7a35578a68babe127edea66f 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test081.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test081.c
@@ -7,17 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will properly use the address-of value (&)
-// variable and and return the value-at address (*) variable from integer 'num'.
-// *TW
+// This code tests to see that the CBE will properly use the address-of value
+// (&) variable and return the value-at address (*) variable from integer
+// 'num'. *TW
 //
 //===----------------------------------------------------------------------===//
 
-int main(){
-   int *ptr;
-   int num = 6;
-   ptr = &num;
-   int deref = *ptr;
-   return deref;
-
+int main() {
+  int *ptr;
+  int num = 6;
+  ptr = &num;
+  int deref = *ptr;
+  return deref;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test082.c b/hpvm/projects/llvm-cbe/test/cfiles/test082.c
index e30bb7a2f192adc03fd646bc625340c847cfe92c..7a7bf109eb1dc2b29ab1e66c4b1b804df6e586e3 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test082.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test082.c
@@ -13,13 +13,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-struct Number{
-   int price;
+struct Number {
+  int price;
 };
 
-int main(){
-   struct Number a;
-   struct Number* ptr = &a;
-   ptr->price = 6;
-   return ptr->price;
+int main() {
+  struct Number a;
+  struct Number *ptr = &a;
+  ptr->price = 6;
+  return ptr->price;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test083.c b/hpvm/projects/llvm-cbe/test/cfiles/test083.c
index 5dc920edf485b69c20b78886a4eb2229af9151ad..58eb4c14a3dcd37623a1bd972705ac0e4cd46703 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test083.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test083.c
@@ -13,12 +13,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-int main(){
-   int *ip;
-   int a[2];
-   a[0] = 1;
-   a[1] = 6;
-   ip = &a[1];
+int main() {
+  int *ip;
+  int a[2];
+  a[0] = 1;
+  a[1] = 6;
+  ip = &a[1];
 
-   return *ip;
+  return *ip;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test084.c b/hpvm/projects/llvm-cbe/test/cfiles/test084.c
index 6f5b3ad6d9cc526c609719308fa1da9a8ab6ab47..3a67fc1ef9cef5b1340e68eb41d93500000c5a26 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test084.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test084.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,9 +8,9 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will properly increment a pointer via int.
-// This example works by subtracting two mem. addresses and adding 2 to return 6.
-// *TW
+// This code tests to see that the CBE will properly increment a pointer via
+// int. This example works by subtracting two mem. addresses and adding 2 to
+// return 6. *TW
 //
 //===---------------------------------------------------------------------------===//
 
@@ -20,9 +21,9 @@ int main() {
   intptr_t inc0 = 0, inc1 = 0, diff = 0, a = 100;
   intptr_t *p = &a;
   inc0 = (intptr_t)p;
-  ++(*p++);  //++(*p++);
+  ++(*p++); //++(*p++);
   inc1 = (intptr_t)p;
-  diff =  inc1-inc0;
+  diff = inc1 - inc0;
   diff += 2;
   return diff;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test085.c b/hpvm/projects/llvm-cbe/test/cfiles/test085.c
index 01e8d65e6cbb83bc21b46ff9c493284f8e41d2cd..04c47b83d6bce8e9cb8dba4deff3171ece95b46c 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test085.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test085.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,9 +8,9 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will properly decrement a pointer via int.
-// This example works by subtracting two mem. addresses and adding 2 to return 6.
-// *TW
+// This code tests to see that the CBE will properly decrement a pointer via
+// int. This example works by subtracting two mem. addresses and adding 2 to
+// return 6. *TW
 //
 //===---------------------------------------------------------------------------===//
 
@@ -22,8 +23,7 @@ int main() {
   inc0 = (intptr_t)p;
   --(*p--); //--(*p--);
   inc1 = (intptr_t)p;
-  diff =  inc0-inc1;
+  diff = inc0 - inc1;
   diff += 2;
   return diff;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test086.c b/hpvm/projects/llvm-cbe/test/cfiles/test086.c
index 72e7f03901df7570e5e134c4707c20e8fada74a5..32e33e992378733e721082dc94c339b64bc1cd81 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test086.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test086.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,9 +8,9 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will properly increment a pointer via char.
-// This example works by subtracting two mem. addresses and adding 2 to return 6.
-// *TW
+// This code tests to see that the CBE will properly increment a pointer via
+// char. This example works by subtracting two mem. addresses and adding 2 to
+// return 6. *TW
 //
 //===---------------------------------------------------------------------------===//
 
@@ -24,5 +25,5 @@ int main() {
   // diff =  inc1-inc0;
   // diff += 2;
   // return diff;
-  return 6; //TODO
+  return 6; // TODO
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test087.c b/hpvm/projects/llvm-cbe/test/cfiles/test087.c
index 29291167906a5cb9fd3aedaa0d3523eaa54d5bbd..6c983a65d62b9a71c9c2be11a8107e734628f999 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test087.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test087.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,9 +8,9 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will properly decrement a pointer via char.
-// This example works by subtracting two mem. addresses and adding 2 to return 6.
-// *TW
+// This code tests to see that the CBE will properly decrement a pointer via
+// char. This example works by subtracting two mem. addresses and adding 2 to
+// return 6. *TW
 //===---------------------------------------------------------------------------===//
 
 int main() {
@@ -23,5 +24,5 @@ int main() {
   // diff =  inc0-inc1;
   // diff += 2;
   // return diff;
-  return 6; //TODO
+  return 6; // TODO
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test088.c b/hpvm/projects/llvm-cbe/test/cfiles/test088.c
index 938237bea9774b7c9e52b36ae20d814bb563507c..7cefca1537290d57c2adce83321b848ca82fcbe3 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test088.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test088.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===---------------------------------------------------------------------------===//
 
-int main(){
-   int a[2][2];
-   int *ip;
-   a[0][0] = 0;
-   a[0][1] = 1;
-   a[1][0] = 3;
-   a[1][1] = 6;
-   ip = &a[1][1];
+int main() {
+  int a[2][2];
+  int *ip;
+  a[0][0] = 0;
+  a[0][1] = 1;
+  a[1][0] = 3;
+  a[1][1] = 6;
+  ip = &a[1][1];
 
-   return *ip;
+  return *ip;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test089.c b/hpvm/projects/llvm-cbe/test/cfiles/test089.c
index 925c3bb56ba77bb395641197dd4b5cef231d369e..59b20d5b45ba6c4d4d0cd70e04d0fd99d0253964 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test089.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test089.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,20 +8,20 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute data-packing in a structure correctly.
-// *TW
+// This code tests to see that the CBE will execute data-packing in a structure
+// correctly. *TW
 //===------------------------------------------------------------------------------===//
 
 #pragma pack(push)
 #pragma pack(1)
 
-struct DataSize{
-    char Data2;
-    char Data3;
-    int Data1;
+struct DataSize {
+  char Data2;
+  char Data3;
+  int Data1;
 };
 
-int main(){
-    struct DataSize example;
-    return sizeof(example);
+int main() {
+  struct DataSize example;
+  return sizeof(example);
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test090.c b/hpvm/projects/llvm-cbe/test/cfiles/test090.c
index 021a05e8a002bcf2320df59c7e39c2963e52c756..d3e64ff5b9b21a68147c0a0aab69d74d05fc93e4 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test090.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test090.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,19 +8,19 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a union and check the data size correctly.
-// *TW
+// This code tests to see that the CBE will execute a union and check the data
+// size correctly. *TW
 //===------------------------------------------------------------------------------===//
 
-union Data{
-   int i;
-   float f;
-   char  str[8];
+union Data {
+  int i;
+  float f;
+  char str[8];
 };
 
-int main(){
-   union Data data;
-   int datasize = sizeof(data) - 2;
+int main() {
+  union Data data;
+  int datasize = sizeof(data) - 2;
 
-   return datasize;
+  return datasize;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test091.c b/hpvm/projects/llvm-cbe/test/cfiles/test091.c
index dce59d85d5b788696deb7e0b4e0a97e69cdea0e8..557286e1ddd2326f912d4ae788218fce536a0a25 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test091.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test091.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,19 +8,17 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will access and return union members correctly.
-// *TW
+// This code tests to see that the CBE will access and return union members
+// correctly. *TW
 //===------------------------------------------------------------------------------===//
 
-union Data{
-   char unit1[6];
-   char unit2;
-   char unit3;
+union Data {
+  char unit1[6];
+  char unit2;
+  char unit3;
 };
 
-int main(){
-   union Data data;
-   return sizeof(data);
+int main() {
+  union Data data;
+  return sizeof(data);
 }
-
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test092.c b/hpvm/projects/llvm-cbe/test/cfiles/test092.c
index 3b197f21a5f8964daf0ac427955df96faa9feec2..8018bca7eecd1b4196f822bc354108e6b5e8dc27 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test092.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test092.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,29 +8,25 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will pass a structure into a function correctly.
-// *TW
+// This code tests to see that the CBE will pass a structure into a function
+// correctly. *TW
 //===------------------------------------------------------------------------------===//
 
 int k = 0;
 
-struct test{
-   int i;
-   float f;
+struct test {
+  int i;
+  float f;
 };
 
-void funct(struct test example){
-   k = example.i;
-}
+void funct(struct test example) { k = example.i; }
 
-int main(){
-   struct test example;
+int main() {
+  struct test example;
 
-   example.i = 6;
-   example.f = 6.0;
-   funct(example);
+  example.i = 6;
+  example.f = 6.0;
+  funct(example);
 
-   return k;
+  return k;
 }
-
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test093.c b/hpvm/projects/llvm-cbe/test/cfiles/test093.c
index 3553edea3a5fdb8680feaf2297ab32938ca2c608..9a6188e7d4d13b8e5b73a2e0cf832cb3ddb0f0ba 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test093.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test093.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,19 +12,19 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-struct layer1{
-   int depth1;
-   char name1[20];
+struct layer1 {
+  int depth1;
+  char name1[20];
 };
 
-struct layer2{
-   int depth2;
-   char name2[20];
-   struct layer1 layer_data;
-}layer2_data;
+struct layer2 {
+  int depth2;
+  char name2[20];
+  struct layer1 layer_data;
+} layer2_data;
 
-int main(){
-   struct layer2 layer2_data = {1, "test", {6, "test2"}};
+int main() {
+  struct layer2 layer2_data = {1, "test", {6, "test2"}};
 
-   return layer2_data.layer_data.depth1;
+  return layer2_data.layer_data.depth1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test094.c b/hpvm/projects/llvm-cbe/test/cfiles/test094.c
index 2568c9c3537d9cedce0cb36e86a414c068493504..8faf3330cc9f360debb2434f2720cfead79be20e 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test094.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test094.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,18 +13,17 @@
 //===------------------------------------------------------------------------------===//
 
 typedef struct test {
- int var1;
- int var2;
- int var3;
-}testrename;
+  int var1;
+  int var2;
+  int var3;
+} testrename;
 
-int main(){
-    testrename variable;
+int main() {
+  testrename variable;
 
-    variable.var2 = 5;
-    variable.var3 = 6;
-    variable.var1 = 9;
+  variable.var2 = 5;
+  variable.var3 = 6;
+  variable.var1 = 9;
 
-    return variable.var3;
+  return variable.var3;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test095.c b/hpvm/projects/llvm-cbe/test/cfiles/test095.c
index 21db27203416db2f9454ce203eed555299465a40..b622c4b94c071548e734f4dd4ceec7097b5b90a2 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test095.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test095.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,17 +12,16 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-struct Shows
-    {
-     char show[20];
-     int runlength;
-     int rating;
+struct Shows {
+  char show[20];
+  int runlength;
+  int rating;
 };
 
-int main(){
-struct Shows b1[3] = {
-        {"Big Bang Theory",22,6},
-        {"NCIS",45,9},
-    };
-    return b1[0].rating;
+int main() {
+  struct Shows b1[3] = {
+      {"Big Bang Theory", 22, 6},
+      {"NCIS", 45, 9},
+  };
+  return b1[0].rating;
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test096.c b/hpvm/projects/llvm-cbe/test/cfiles/test096.c
index 81661df1212b75da06f9eabcc9e64c82118172ad..35982e134131b895bcf15cbb22bda76e482d1e0b 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test096.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test096.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,18 +8,18 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a self referencing structure.
-// *TW
+// This code tests to see that the CBE will execute a self referencing
+// structure. *TW
 //===------------------------------------------------------------------------------===//
 #include <stdio.h> //for NULL
 
-struct data{
-   int a;
-   struct data *ptr;
+struct data {
+  int a;
+  struct data *ptr;
 };
 
-int main(){
-   struct data p=(struct data){.a=3,.ptr=&(struct data){.a=6,.ptr=NULL}};
-   return p.ptr->a;
+int main() {
+  struct data p =
+      (struct data){.a = 3, .ptr = &(struct data){.a = 6, .ptr = NULL}};
+  return p.ptr->a;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test097.c b/hpvm/projects/llvm-cbe/test/cfiles/test097.c
index a42e36b6cb43551113d7c38984895a07a479ffc4..6e0f8145b0909b0c6c6b3f26e09633b8ccc58b12 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test097.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test097.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,17 +12,16 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-int addby2 ( int x );
+int addby2(int x);
 
-int main( ){
-    int n ;
-    n = addby2 ( 4 ) ;
-    return n;
+int main() {
+  int n;
+  n = addby2(4);
+  return n;
 }
 
-int addby2(int x){
-    int p ;
-    p = x + 2 ;
-    return ( p ) ;
+int addby2(int x) {
+  int p;
+  p = x + 2;
+  return (p);
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test098.c b/hpvm/projects/llvm-cbe/test/cfiles/test098.c
index 70de117e51a9064e638354fe78072e93a635c904..d8594b5a7615b6be6fcc0cb7a04b9e5ff972acd3 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test098.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test098.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,18 +12,18 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-int subtrby2 ( int x );
+int subtrby2(int x);
 static int eight = 8;
 static int two = 2;
 
-int main( ){
-    int n ;
-    n = subtrby2 ( eight ) ;
-    return n;
+int main() {
+  int n;
+  n = subtrby2(eight);
+  return n;
 }
 
-int subtrby2(int x){
-    int p ;
-    p = x - two ;
-    return ( p ) ;
+int subtrby2(int x) {
+  int p;
+  p = x - two;
+  return (p);
 }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test099.c b/hpvm/projects/llvm-cbe/test/cfiles/test099.c
index 1c4713262eeaf6042f7af0ee7e6e41547f226bb2..c4ab77522b27cdf29251409e707bb6548891e9a7 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test099.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test099.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,9 +13,8 @@
 //===------------------------------------------------------------------------------===//
 
 int main() {
-    register int counter = 0;
-    counter += 6;
+  register int counter = 0;
+  counter += 6;
 
-    return 6;
+  return 6;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test100.c b/hpvm/projects/llvm-cbe/test/cfiles/test100.c
index db2cd9ea604e3a5aa1b64eaf4159ae9f1fe2700c..2b6a07912d94388827c9cde38e997ca96249b269 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test100.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test100.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,20 +12,19 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-int fibonaci(int i){
-   if(i == 0){
-      return 0;
-   }
-   if(i == 1){
-      return 1;
-   }
-   return fibonaci(i-1) + fibonaci(i-2);
+int fibonaci(int i) {
+  if (i == 0) {
+    return 0;
+  }
+  if (i == 1) {
+    return 1;
+  }
+  return fibonaci(i - 1) + fibonaci(i - 2);
 }
 
-int  main(){
-    int returnval;
-    returnval = fibonaci(6) - 2;
+int main() {
+  int returnval;
+  returnval = fibonaci(6) - 2;
 
-    return returnval;
+  return returnval;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test101.c b/hpvm/projects/llvm-cbe/test/cfiles/test101.c
index 50d18d3ec33746d58a24cf342247e717d926d31a..ffffeb592072391026a4b0c3a705e8c63db235fd 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test101.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test101.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,24 +15,26 @@
 
 unsigned int fastfib(unsigned int n);
 
-int main(){
-    return fastfib(6) - 2;
-}
+int main() { return fastfib(6) - 2; }
 
-unsigned int fastfib(unsigned int n){
-    unsigned int a[3];
-    unsigned int *p=a;
-    unsigned int i;
+unsigned int fastfib(unsigned int n) {
+  unsigned int a[3];
+  unsigned int *p = a;
+  unsigned int i;
 
-    for(i=0; i<=n; ++i) {
-        if(i<2) *p=i;
-        else{
-            if(p==a) *p=*(a+1)+*(a+2);
-            else if(p==a+1) *p=*a+*(a+2);
-            else *p=*a+*(a+1);
-        }
-        if(++p>a+2) p=a;
+  for (i = 0; i <= n; ++i) {
+    if (i < 2)
+      *p = i;
+    else {
+      if (p == a)
+        *p = *(a + 1) + *(a + 2);
+      else if (p == a + 1)
+        *p = *a + *(a + 2);
+      else
+        *p = *a + *(a + 1);
     }
-    return p==a?*(p+2):*(p-1);
+    if (++p > a + 2)
+      p = a;
+  }
+  return p == a ? *(p + 2) : *(p - 1);
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test102.c b/hpvm/projects/llvm-cbe/test/cfiles/test102.c
index 572ea0310334592c668e6266da7c364d39a80ebb..44247c6231a26acfca041e6896bcbb300d2bc6f5 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test102.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test102.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test103.c b/hpvm/projects/llvm-cbe/test/cfiles/test103.c
index 6e2329021d257f46fb2b818e68f932e90899b8d8..e751c2d8a4e3c2249921b15c833ae0e99a47d10a 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test103.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test103.c
@@ -15,9 +15,8 @@
 #define B 3
 #define C A + B
 
-int main(){
+int main() {
 
-   int x = C;
-   return x;
+  int x = C;
+  return x;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test104.c b/hpvm/projects/llvm-cbe/test/cfiles/test104.c
index 88884d68575f413784f039a1685430c8e1dce56e..43c29dedb685484fd779d0565eaf3d30f97c160a 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test104.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test104.c
@@ -12,13 +12,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-int tail (int n) {
+int tail(int n) {
   if (n == 6)
     return n;
   else
-    return tail(n+1);
+    return tail(n + 1);
 }
 
-int main(){
-  return tail(0);
-}
+int main() { return tail(0); }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/test105.c b/hpvm/projects/llvm-cbe/test/cfiles/test105.c
index 7e830d55c55182e5d995a8841c41132555c54ee4..79ab340aef5c7db27c06d076efa95bb85fb5a964 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/test105.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/test105.c
@@ -12,13 +12,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-int head(int n){
-  if(n == 6)
+int head(int n) {
+  if (n == 6)
     return n;
   else
-    return head(n+1);
+    return head(n + 1);
 }
 
-int main(){
-  return head(0);
-}
+int main() { return head(0); }
diff --git a/hpvm/projects/llvm-cbe/test/cfiles/testbad.c b/hpvm/projects/llvm-cbe/test/cfiles/testbad.c
index a7456dc2b52888358ae2e7fce0da5b0c799c9b45..a8a9bca17c49e5ebed010beeeb987ac5904b3b10 100644
--- a/hpvm/projects/llvm-cbe/test/cfiles/testbad.c
+++ b/hpvm/projects/llvm-cbe/test/cfiles/testbad.c
@@ -11,7 +11,4 @@
 //
 //===----------------------------------------------------------------------===//
 
-int main()
-{
-    return 25;
-}
+int main() { return 25; }
diff --git a/hpvm/projects/llvm-cbe/test/dtypes.h b/hpvm/projects/llvm-cbe/test/dtypes.h
index 3ab8d8b1c5399d17cfb052d144d8356783574027..00f1b417bfc5c2c5c4a499516346896c1ad21c75 100644
--- a/hpvm/projects/llvm-cbe/test/dtypes.h
+++ b/hpvm/projects/llvm-cbe/test/dtypes.h
@@ -26,22 +26,21 @@
 
 #if !defined(_COMPILER_MINGW_)
 
-#define strtoull                                            _strtoui64
-#define strtoll                                             _strtoi64
-#define strcasecmp                                          _stricmp
-#define strncasecmp                                         _strnicmp
-#define snprintf                                            _snprintf
-#define stat                                                _stat
+#define strtoull _strtoui64
+#define strtoll _strtoi64
+#define strcasecmp _stricmp
+#define strncasecmp _strnicmp
+#define snprintf _snprintf
+#define stat _stat
 
-#define STDIN_FILENO                                        0
-#define STDOUT_FILENO                                       1
-#define STDERR_FILENO                                       2
+#define STDIN_FILENO 0
+#define STDOUT_FILENO 1
+#define STDERR_FILENO 2
 
 #endif /* !_COMPILER_MINGW_ */
 
 #endif /* _OS_WINDOWS_ */
 
-
 /*
   This file defines sane integer types for our target platforms. This
   library only runs on machines with the following characteristics:
@@ -56,86 +55,86 @@
 
 #ifdef _OS_WINDOWS_
 #define STDCALL __stdcall
-# ifdef LIBRARY_EXPORTS
-#  define JL_DLLEXPORT __declspec(dllexport)
-# else
-#  define JL_DLLEXPORT __declspec(dllimport)
-# endif
+#ifdef LIBRARY_EXPORTS
+#define JL_DLLEXPORT __declspec(dllexport)
+#else
+#define JL_DLLEXPORT __declspec(dllimport)
+#endif
 #else
 #define STDCALL
-#define JL_DLLEXPORT __attribute__ ((visibility("default")))
+#define JL_DLLEXPORT __attribute__((visibility("default")))
 #endif
 
 #ifdef _OS_LINUX_
 #include <endian.h>
-#define LITTLE_ENDIAN  __LITTLE_ENDIAN
-#define BIG_ENDIAN     __BIG_ENDIAN
-#define PDP_ENDIAN     __PDP_ENDIAN
-#define BYTE_ORDER     __BYTE_ORDER
+#define LITTLE_ENDIAN __LITTLE_ENDIAN
+#define BIG_ENDIAN __BIG_ENDIAN
+#define PDP_ENDIAN __PDP_ENDIAN
+#define BYTE_ORDER __BYTE_ORDER
 #endif
 
 #if defined(__APPLE__) || defined(__FreeBSD__)
 #include <machine/endian.h>
-#define __LITTLE_ENDIAN  LITTLE_ENDIAN
-#define __BIG_ENDIAN     BIG_ENDIAN
-#define __PDP_ENDIAN     PDP_ENDIAN
-#define __BYTE_ORDER     BYTE_ORDER
+#define __LITTLE_ENDIAN LITTLE_ENDIAN
+#define __BIG_ENDIAN BIG_ENDIAN
+#define __PDP_ENDIAN PDP_ENDIAN
+#define __BYTE_ORDER BYTE_ORDER
 #endif
 
 #ifdef _OS_WINDOWS_
-#define __LITTLE_ENDIAN    1234
-#define __BIG_ENDIAN       4321
-#define __PDP_ENDIAN       3412
-#define __BYTE_ORDER       __LITTLE_ENDIAN
+#define __LITTLE_ENDIAN 1234
+#define __BIG_ENDIAN 4321
+#define __PDP_ENDIAN 3412
+#define __BYTE_ORDER __LITTLE_ENDIAN
 #define __FLOAT_WORD_ORDER __LITTLE_ENDIAN
-#define LITTLE_ENDIAN      __LITTLE_ENDIAN
-#define BIG_ENDIAN         __BIG_ENDIAN
-#define PDP_ENDIAN         __PDP_ENDIAN
-#define BYTE_ORDER         __BYTE_ORDER
+#define LITTLE_ENDIAN __LITTLE_ENDIAN
+#define BIG_ENDIAN __BIG_ENDIAN
+#define PDP_ENDIAN __PDP_ENDIAN
+#define BYTE_ORDER __BYTE_ORDER
 #endif
 
 #define LLT_ALLOC(n) malloc(n)
-#define LLT_REALLOC(p,n) realloc((p),(n))
+#define LLT_REALLOC(p, n) realloc((p), (n))
 #define LLT_FREE(x) free(x)
 
 #if defined(_OS_WINDOWS_) && defined(_COMPILER_INTEL_)
-#  define STATIC_INLINE static
-#  define INLINE
+#define STATIC_INLINE static
+#define INLINE
 #elif defined(_OS_WINDOWS_) && defined(_COMPILER_MICROSOFT_)
-#  define STATIC_INLINE static __inline
-#  define INLINE __inline
+#define STATIC_INLINE static __inline
+#define INLINE __inline
 #else
-#  define STATIC_INLINE static inline
-#  define INLINE inline
+#define STATIC_INLINE static inline
+#define INLINE inline
 #endif
 
 #if defined(_OS_WINDOWS_) && !defined(_COMPILER_MINGW_)
-#  define NOINLINE __declspec(noinline)
-#  define NOINLINE_DECL(f) __declspec(noinline) f
+#define NOINLINE __declspec(noinline)
+#define NOINLINE_DECL(f) __declspec(noinline) f
 #else
-#  define NOINLINE __attribute__((noinline))
-#  define NOINLINE_DECL(f) f __attribute__((noinline))
+#define NOINLINE __attribute__((noinline))
+#define NOINLINE_DECL(f) f __attribute__((noinline))
 #endif
 
 #ifdef _COMPILER_MICROSOFT_
-# ifdef _P64
-#  define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) __declspec(align(8)) x
-# else
-#  define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) __declspec(align(4)) x
-# endif
+#ifdef _P64
+#define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) __declspec(align(8)) x
+#else
+#define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) __declspec(align(4)) x
+#endif
 #elif defined(__GNUC__)
-#  define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) x __attribute__ ((aligned (sizeof(void*))))
+#define JL_ATTRIBUTE_ALIGN_PTRSIZE(x) x __attribute__((aligned(sizeof(void *))))
 #else
-#  define JL_ATTRIBUTE_ALIGN_PTRSIZE(x)
+#define JL_ATTRIBUTE_ALIGN_PTRSIZE(x)
 #endif
 
 typedef int bool_t;
-typedef unsigned char  byte_t;   /* 1 byte */
+typedef unsigned char byte_t; /* 1 byte */
 
 #ifdef _P64
 #define TOP_BIT 0x8000000000000000
 #define NBITS 64
-typedef uint64_t uint_t;  // preferred int type on platform
+typedef uint64_t uint_t; // preferred int type on platform
 typedef int64_t int_t;
 #else
 #define TOP_BIT 0x80000000
@@ -144,17 +143,16 @@ typedef uint32_t uint_t;
 typedef int32_t int_t;
 #endif
 
-STATIC_INLINE unsigned int next_power_of_two(unsigned int val)
-{
-    /* this function taken from libuv src/unix/core.c */
-    val -= 1;
-    val |= val >> 1;
-    val |= val >> 2;
-    val |= val >> 4;
-    val |= val >> 8;
-    val |= val >> 16;
-    val += 1;
-    return val;
+STATIC_INLINE unsigned int next_power_of_two(unsigned int val) {
+  /* this function taken from libuv src/unix/core.c */
+  val -= 1;
+  val |= val >> 1;
+  val |= val >> 2;
+  val |= val >> 4;
+  val |= val >> 8;
+  val |= val >> 16;
+  val += 1;
+  return val;
 }
 
 #define LLT_ALIGN(x, sz) (((x) + (sz)-1) & -(sz))
@@ -162,22 +160,22 @@ STATIC_INLINE unsigned int next_power_of_two(unsigned int val)
 // branch prediction annotations
 #ifdef __GNUC__
 #define __unlikely(x) __builtin_expect(!!(x), 0)
-#define __likely(x)   __builtin_expect(!!(x), 1)
+#define __likely(x) __builtin_expect(!!(x), 1)
 #else
 #define __unlikely(x) (x)
-#define __likely(x)   (x)
+#define __likely(x) (x)
 #endif
 
 #define DBL_MAXINT 9007199254740992LL
 #define FLT_MAXINT 16777216
-#define U64_MAX    18446744073709551615ULL
-#define S64_MAX    9223372036854775807LL
-#define S64_MIN    (-S64_MAX - 1LL)
-#define BIT63      0x8000000000000000LL
-#define U32_MAX    4294967295L
-#define S32_MAX    2147483647L
-#define S32_MIN    (-S32_MAX - 1L)
-#define BIT31      0x80000000
+#define U64_MAX 18446744073709551615ULL
+#define S64_MAX 9223372036854775807LL
+#define S64_MIN (-S64_MAX - 1LL)
+#define BIT63 0x8000000000000000LL
+#define U32_MAX 4294967295L
+#define S32_MAX 2147483647L
+#define S32_MIN (-S32_MAX - 1L)
+#define BIT31 0x80000000
 
 #define D_PNAN ((double)+NAN)
 #define D_NNAN ((double)-NAN)
@@ -188,17 +186,27 @@ STATIC_INLINE unsigned int next_power_of_two(unsigned int val)
 #define F_PINF ((float)+INFINITY)
 #define F_NINF ((float)-INFINITY)
 
-typedef enum { T_INT8, T_UINT8, T_INT16, T_UINT16, T_INT32, T_UINT32,
-               T_INT64, T_UINT64, T_FLOAT, T_DOUBLE } numerictype_t;
-
-#define N_NUMTYPES ((int)T_DOUBLE+1)
+typedef enum {
+  T_INT8,
+  T_UINT8,
+  T_INT16,
+  T_UINT16,
+  T_INT32,
+  T_UINT32,
+  T_INT64,
+  T_UINT64,
+  T_FLOAT,
+  T_DOUBLE
+} numerictype_t;
+
+#define N_NUMTYPES ((int)T_DOUBLE + 1)
 
 #ifdef _P64
-# define T_PTRDIFF T_INT64
-# define T_SIZE T_UINT64
+#define T_PTRDIFF T_INT64
+#define T_SIZE T_UINT64
 #else
-# define T_PTRDIFF T_INT32
-# define T_SIZE T_UINT32
+#define T_PTRDIFF T_INT32
+#define T_SIZE T_UINT32
 #endif
 
 #endif /* DTYPES_H */
\ No newline at end of file
diff --git a/hpvm/projects/llvm-cbe/test/platform.h b/hpvm/projects/llvm-cbe/test/platform.h
index 0b7c6bcbbdd700d90adc40f495c9210241fb32be..8db68aae7bceb0506c2f8620a971a2d86fdb1695 100644
--- a/hpvm/projects/llvm-cbe/test/platform.h
+++ b/hpvm/projects/llvm-cbe/test/platform.h
@@ -30,8 +30,8 @@
  */
 
 /*******************************************************************************
-*                               Compiler                                       *
-*******************************************************************************/
+ *                               Compiler *
+ *******************************************************************************/
 
 /*
  * Notes:
@@ -60,8 +60,8 @@
 #endif
 
 /*******************************************************************************
-*                               OS                                             *
-*******************************************************************************/
+ *                               OS *
+ *******************************************************************************/
 
 #if defined(__FreeBSD__)
 #define _OS_FREEBSD_
@@ -74,12 +74,14 @@
 #endif
 
 /*******************************************************************************
-*                               Architecture                                   *
-*******************************************************************************/
+ *                               Architecture *
+ *******************************************************************************/
 
-#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
+#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) ||           \
+    defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
 #define _CPU_X86_64_
-#elif defined(i386) || defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(_X86_)
+#elif defined(i386) || defined(__i386) || defined(__i386__) ||                 \
+    defined(_M_IX86) || defined(_X86_)
 #define _CPU_X86_
 #elif defined(__aarch64__)
 #define _CPU_AARCH64_
@@ -92,22 +94,22 @@
 #endif
 
 #if defined(_CPU_X86_64_)
-#  define _P64
+#define _P64
 #elif defined(_CPU_X86_)
-#  define _P32
+#define _P32
 #elif defined(_OS_WINDOWS_)
 /* Not sure how to determine pointer size on Windows running ARM. */
-#  if _WIN64
-#    define _P64
-#  else
-#    define _P32
-#  endif
+#if _WIN64
+#define _P64
+#else
+#define _P32
+#endif
 #elif __SIZEOF_POINTER__ == 8
-#    define _P64
+#define _P64
 #elif __SIZEOF_POINTER__ == 4
-#    define _P32
+#define _P32
 #else
-#  error pointer size not known for your platform / compiler
+#error pointer size not known for your platform / compiler
 #endif
 
 #endif /* !PLATFORM_H */
\ No newline at end of file
diff --git a/hpvm/projects/llvm-cbe/test/selectionsort/main.c b/hpvm/projects/llvm-cbe/test/selectionsort/main.c
index 47cf877d34daea4d6ea2cb45fec6053e04cfbec5..65848e6adab8d6eb1e6b75ece1b720849d510623 100644
--- a/hpvm/projects/llvm-cbe/test/selectionsort/main.c
+++ b/hpvm/projects/llvm-cbe/test/selectionsort/main.c
@@ -1,39 +1,34 @@
 #include <stdio.h>
 
-int main()
-{
-	int array[100], n, c, d, position, swap;
-	
-	printf("Enter number of elements\n");
-	scanf("%d", &n);
-	
-	printf("Enter %d integers\n", n);
-	
-	for (c = 0; c < n; c++)
-		scanf("%d", &array[c]);
-		
-	for (c = 0; c < (n - 1); c++)
-	{
-		position = c;
-		
-		for (d = c +1; d < n; d++)
-		{
-			if (array[position] > array[d])
-				position = d;
-		}
-		if (position != c)
-		{
-			swap = array[c];
-			array[c] = array[position];
-			array[position] = swap;
-		}
-	}
-	
-	printf("Sorted list in ascending order:\n");
-	
-	for (c = 0; c < n; c++)
-		printf("%d\n", array[c]);
-	
-	return 0;
+int main() {
+  int array[100], n, c, d, position, swap;
+
+  printf("Enter number of elements\n");
+  scanf("%d", &n);
+
+  printf("Enter %d integers\n", n);
+
+  for (c = 0; c < n; c++)
+    scanf("%d", &array[c]);
+
+  for (c = 0; c < (n - 1); c++) {
+    position = c;
+
+    for (d = c + 1; d < n; d++) {
+      if (array[position] > array[d])
+        position = d;
+    }
+    if (position != c) {
+      swap = array[c];
+      array[c] = array[position];
+      array[position] = swap;
+    }
+  }
+
+  printf("Sorted list in ascending order:\n");
+
+  for (c = 0; c < n; c++)
+    printf("%d\n", array[c]);
+
+  return 0;
 }
-	
diff --git a/hpvm/projects/llvm-cbe/test/test001.c b/hpvm/projects/llvm-cbe/test/test001.c
index 817d7ca8cae09d11e57848ee7d3fdb9a7931d19a..8606d141ba73ddce2a598e85c6a787d715b1a5e2 100644
--- a/hpvm/projects/llvm-cbe/test/test001.c
+++ b/hpvm/projects/llvm-cbe/test/test001.c
@@ -11,7 +11,4 @@
 //
 //===----------------------------------------------------------------------===//
 
-int main()
-{
-    return 6;
-}
+int main() { return 6; }
diff --git a/hpvm/projects/llvm-cbe/test/test002.c b/hpvm/projects/llvm-cbe/test/test002.c
index 9af3c34ee82cf9517f0f4ed4015a239fdace5cfb..aeb02526f8b2bda1b0bae293d1f006c6a4622641 100644
--- a/hpvm/projects/llvm-cbe/test/test002.c
+++ b/hpvm/projects/llvm-cbe/test/test002.c
@@ -8,14 +8,13 @@
 //===----------------------------------------------------------------------===//
 //
 // This code tests to see that the CBE will execute a for loop correctly.
-// *TW 
+// *TW
 //
 //===----------------------------------------------------------------------===//
 
-int main()
-{
-    int i, x = 0;
-    for (i = 0; i < 6; i++)
-        ++x;
-    return x;
+int main() {
+  int i, x = 0;
+  for (i = 0; i < 6; i++)
+    ++x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test003.c b/hpvm/projects/llvm-cbe/test/test003.c
index 4aa8eb6bfb6e4a4e4d67f5fd4f5847cc608d6cae..bfeaef5db7a85f23c746b90f17461fb10dfd87e8 100644
--- a/hpvm/projects/llvm-cbe/test/test003.c
+++ b/hpvm/projects/llvm-cbe/test/test003.c
@@ -11,13 +11,11 @@
 // *TW
 //===----------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 0, x = 0;
-    while (i < 6) {
-        ++x;
-        ++i;
-    }
-    return x;
-} 
-   
+int main() {
+  int i = 0, x = 0;
+  while (i < 6) {
+    ++x;
+    ++i;
+  }
+  return x;
+}
diff --git a/hpvm/projects/llvm-cbe/test/test004.c b/hpvm/projects/llvm-cbe/test/test004.c
index ba619f09bbaab461723a6f85dca1dfbb28ceac41..35a5a02d83091093a1b251bbb5a7158b11d93244 100644
--- a/hpvm/projects/llvm-cbe/test/test004.c
+++ b/hpvm/projects/llvm-cbe/test/test004.c
@@ -7,15 +7,15 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute an if/else statement correctly.
-// *TW
+// This code tests to see that the CBE will execute an if/else statement
+// correctly. *TW
 //===----------------------------------------------------------------------===//
 
 int main() {
-    int x = 3;
-    x += 3;
-    if (x == 6)
-        return x;
-    else
-        return 0;
+  int x = 3;
+  x += 3;
+  if (x == 6)
+    return x;
+  else
+    return 0;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test005.c b/hpvm/projects/llvm-cbe/test/test005.c
index 8b9323a97e3a27cfb4cc45b17ba26b39c96a180c..a287f075cd3b152e84a0bd24ce35097c5bb231b7 100644
--- a/hpvm/projects/llvm-cbe/test/test005.c
+++ b/hpvm/projects/llvm-cbe/test/test005.c
@@ -12,10 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 int main() {
-    int i, j, x = 0;
-    for (i = 0; i < 3; i++)
-        for (j = 0; j < 2; j++)
-            ++x;
-            
-    return x;
+  int i, j, x = 0;
+  for (i = 0; i < 3; i++)
+    for (j = 0; j < 2; j++)
+      ++x;
+
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test006.c b/hpvm/projects/llvm-cbe/test/test006.c
index b513d75d4ab163388f15f156d1387d5b71dfcdf4..fe901d6d19cd2dabd11f66623d1f1ca3d0cf55b9 100644
--- a/hpvm/projects/llvm-cbe/test/test006.c
+++ b/hpvm/projects/llvm-cbe/test/test006.c
@@ -7,18 +7,18 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a nested while loop correctly.
-// *TW
+// This code tests to see that the CBE will execute a nested while loop
+// correctly. *TW
 //===----------------------------------------------------------------------===//
 
 int main() {
-    int i = 0, j = 0, x = 0;
-    while (i < 6) {
-        while (j < 6) {
-            ++x;
-            ++j;
-        }
-      ++i;
+  int i = 0, j = 0, x = 0;
+  while (i < 6) {
+    while (j < 6) {
+      ++x;
+      ++j;
     }
-    return x;
+    ++i;
+  }
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test007.c b/hpvm/projects/llvm-cbe/test/test007.c
index 50c895d18192844c38c53c6e706eb2c4f163713d..b4ff4365db7ad0f48dc9fa2171a818757ba899c1 100644
--- a/hpvm/projects/llvm-cbe/test/test007.c
+++ b/hpvm/projects/llvm-cbe/test/test007.c
@@ -7,27 +7,27 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a switch statement correctly.
-// *TW
+// This code tests to see that the CBE will execute a switch statement
+// correctly. *TW
 //===----------------------------------------------------------------------===//
 
 int main() {
-   char var = 'x';
-   
-   switch (var) {
-      case 'z' :
-         return 0;
-         break;
-      case 'y' :
-         return 1;
-         break;
-      case 'x' :
-         return 6;
-         break;
-      case 'w' :
-         return 7;
-         break;
-      default :
-         return 100;
-   }
+  char var = 'x';
+
+  switch (var) {
+  case 'z':
+    return 0;
+    break;
+  case 'y':
+    return 1;
+    break;
+  case 'x':
+    return 6;
+    break;
+  case 'w':
+    return 7;
+    break;
+  default:
+    return 100;
+  }
 }
diff --git a/hpvm/projects/llvm-cbe/test/test008.c b/hpvm/projects/llvm-cbe/test/test008.c
index 283b8f73bafe45c6225e5270249c458b9a75a80d..f054263e0b5490d25b16c53c082d7b0dfbd1793f 100644
--- a/hpvm/projects/llvm-cbe/test/test008.c
+++ b/hpvm/projects/llvm-cbe/test/test008.c
@@ -12,18 +12,18 @@
 //===----------------------------------------------------------------------===//
 
 struct test {
-   int var1;
-   int var2;
-   int var3;
+  int var1;
+  int var2;
+  int var3;
 };
 
 int main() {
 
-   struct test variable;
+  struct test variable;
 
-   variable.var2 = 5;
-   variable.var3 = 6;
-   variable.var1 = 9;
-    
-   return variable.var3;
+  variable.var2 = 5;
+  variable.var3 = 6;
+  variable.var1 = 9;
+
+  return variable.var3;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test009.c b/hpvm/projects/llvm-cbe/test/test009.c
index a46509105cb73430794e55eb9d5af6d0da98ff6f..1b2fc327e2c7fd67ba1520dbecebd4803507c600 100644
--- a/hpvm/projects/llvm-cbe/test/test009.c
+++ b/hpvm/projects/llvm-cbe/test/test009.c
@@ -12,10 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 int main() {
-    int example[10];
-    int i;
-       for (i = 0;i < 10; ++i) {
-        example[i] = i;
-       }
-       return example[6];
+  int example[10];
+  int i;
+  for (i = 0; i < 10; ++i) {
+    example[i] = i;
+  }
+  return example[6];
 }
diff --git a/hpvm/projects/llvm-cbe/test/test010.c b/hpvm/projects/llvm-cbe/test/test010.c
index e3841e64d3e41aa923201427f6913c3e30a650c9..21c6fdd0c7b6ed0a6c346d01a8e8836a4b2050a5 100644
--- a/hpvm/projects/llvm-cbe/test/test010.c
+++ b/hpvm/projects/llvm-cbe/test/test010.c
@@ -7,37 +7,37 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a nested switch statement correctly.
-// *TW
+// This code tests to see that the CBE will execute a nested switch statement
+// correctly. *TW
 //===----------------------------------------------------------------------===//
 
 int main() {
-   char var = 'x', var2;
-   switch (var) {
-      case 'z' :
-         return 0;
-         break;
-      case 'y' :
-         return 1;
-         break;
-      case 'x' :
-         var2 = 'b';
-    
-         switch (var2) {
-            case 'a' :
-               return 10;
-               break;
-            case 'b' :
-               return 6;
-               break;
-            default :
-               return 18;
-         }
+  char var = 'x', var2;
+  switch (var) {
+  case 'z':
+    return 0;
+    break;
+  case 'y':
+    return 1;
+    break;
+  case 'x':
+    var2 = 'b';
 
-         case 'w' :
-            return 7;
-            break;
-         default :
-            return 100;
-   }
+    switch (var2) {
+    case 'a':
+      return 10;
+      break;
+    case 'b':
+      return 6;
+      break;
+    default:
+      return 18;
+    }
+
+  case 'w':
+    return 7;
+    break;
+  default:
+    return 100;
+  }
 }
diff --git a/hpvm/projects/llvm-cbe/test/test011.c b/hpvm/projects/llvm-cbe/test/test011.c
index aa0ee7229f512c25e2794372eac697c85d35b531..9ff808b7096c728794ed472349b472d5ce61b952 100644
--- a/hpvm/projects/llvm-cbe/test/test011.c
+++ b/hpvm/projects/llvm-cbe/test/test011.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +8,13 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle addition between two variables.
-// *TW
+// This code tests to see that the CBE can handle addition between two
+// variables. *TW
 //===------------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 2, t = 4, x = 0;
-    x = i+t;
+int main() {
+  int i = 2, t = 4, x = 0;
+  x = i + t;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test012.c b/hpvm/projects/llvm-cbe/test/test012.c
index 403c635686a51eb493c0ca224c043b6aa6c2fce6..60689156c5bcd5c835aebb0a9c5e0e8d7612d164 100644
--- a/hpvm/projects/llvm-cbe/test/test012.c
+++ b/hpvm/projects/llvm-cbe/test/test012.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//--------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +8,14 @@
 //
 //===----------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle subtraction between two variables.
+// This code tests to see that the CBE can handle subtraction between two
+// variables.
 //  *TW
 //===----------------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 8, t = 2, x = 0;
-    x = i-t;
+int main() {
+  int i = 8, t = 2, x = 0;
+  x = i - t;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test013.c b/hpvm/projects/llvm-cbe/test/test013.c
index 444d4676b78a2f5324cb9bbccfac67bfcf9330aa..9bb5dc492bc251f11c152eb2ea7b506c3354430c 100644
--- a/hpvm/projects/llvm-cbe/test/test013.c
+++ b/hpvm/projects/llvm-cbe/test/test013.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +8,13 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle multiplication between two variables.
-// *TW
+// This code tests to see that the CBE can handle multiplication between two
+// variables. *TW
 //===------------------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 3, t = 2, x = 0;
-    x = i*t;
+int main() {
+  int i = 3, t = 2, x = 0;
+  x = i * t;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test014.c b/hpvm/projects/llvm-cbe/test/test014.c
index e1dc6931f9e989ed867f8abf93dfa5f57042de5c..cbc0ad52d407bfc768a56b7105d4b93a7d2bdaf7 100644
--- a/hpvm/projects/llvm-cbe/test/test014.c
+++ b/hpvm/projects/llvm-cbe/test/test014.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +8,13 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle division between two variables.
-// *TW
+// This code tests to see that the CBE can handle division between two
+// variables. *TW
 //===------------------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 30, t = 5, x = 0;
-    x = i/t;
+int main() {
+  int i = 30, t = 5, x = 0;
+  x = i / t;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test015.c b/hpvm/projects/llvm-cbe/test/test015.c
index e4c2a5c03b28ca481dd3709e18567237cc12a660..81c2f22808e4f4efcb7a4d031faf6a7e2e197f37 100644
--- a/hpvm/projects/llvm-cbe/test/test015.c
+++ b/hpvm/projects/llvm-cbe/test/test015.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,10 +12,9 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-int main()
-{
-    int i = 26, t = 20, x = 0;
-    x = i%t;
+int main() {
+  int i = 26, t = 20, x = 0;
+  x = i % t;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test016.c b/hpvm/projects/llvm-cbe/test/test016.c
index 0841840ebc31ba622a4538f328b658b0bf52e08c..bb5bc64fff2b798375e2c2470e6538d5009c7719 100644
--- a/hpvm/projects/llvm-cbe/test/test016.c
+++ b/hpvm/projects/llvm-cbe/test/test016.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,12 +13,12 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    char ch;
+  char ch;
 
-    if(sizeof(+ch) == 4) {
-        return 6;
-    }
-    return 1;
+  if (sizeof(+ch) == 4) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test017.c b/hpvm/projects/llvm-cbe/test/test017.c
index 0535862b3057ea1cf7ac7ba2801a563a85d75dbe..a87abcd1e8f3311be495deb7bcf369f01ceeaa7f 100644
--- a/hpvm/projects/llvm-cbe/test/test017.c
+++ b/hpvm/projects/llvm-cbe/test/test017.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,11 +14,11 @@
 
 int main() {
 
-    signed int a = 10;
-    signed int b = -a;
+  signed int a = 10;
+  signed int b = -a;
 
-    if(b == -10) {
-        return 6;
-    }
-        return 1;
+  if (b == -10) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test018.c b/hpvm/projects/llvm-cbe/test/test018.c
index c02efa9d0e914b96a6d491769e1a22e2e2747047..ea38b291393f20192f1885bdd702ef321b6929f0 100644
--- a/hpvm/projects/llvm-cbe/test/test018.c
+++ b/hpvm/projects/llvm-cbe/test/test018.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,15 +8,15 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle the incremental (++a) operator.
-// *TW
+// This code tests to see that the CBE can handle the incremental (++a)
+// operator. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 5;
+  int x = 5;
 
-    ++x;
+  ++x;
 
-    return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test019.c b/hpvm/projects/llvm-cbe/test/test019.c
index 1975bb9c5b3e0eaae7a1417310da435f3df8a0d6..484fe0481656cba546bee1565e110f1a0dc90327 100644
--- a/hpvm/projects/llvm-cbe/test/test019.c
+++ b/hpvm/projects/llvm-cbe/test/test019.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,15 +8,15 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle the decremental (--a) operator.
-// *TW
+// This code tests to see that the CBE can handle the decremental (--a)
+// operator. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 7;
-   
-    --x;
+  int x = 7;
 
-    return x;
+  --x;
+
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test020.c b/hpvm/projects/llvm-cbe/test/test020.c
index a68801708d9b628dae8dc3b5dba130f86436bdb6..98ed7f1701cdfdf442e6253706f2bc2f1f30227f 100644
--- a/hpvm/projects/llvm-cbe/test/test020.c
+++ b/hpvm/projects/llvm-cbe/test/test020.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 3;
+  int x = 6;
+  int y = 3;
 
-    if(x > y){
-        return x;
-    }
-    return 1;
+  if (x > y) {
+    return x;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test021.c b/hpvm/projects/llvm-cbe/test/test021.c
index 93eed31d9bbfc11ad0559dcaf649df9e1f9206c1..0c5e63a462482f6b2f5cc392c2508a3076c3c3e6 100644
--- a/hpvm/projects/llvm-cbe/test/test021.c
+++ b/hpvm/projects/llvm-cbe/test/test021.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 6;
+  int x = 6;
+  int y = 6;
 
-    if(x >= y){
-        return x;
-    }
-    return 1;
+  if (x >= y) {
+    return x;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test022.c b/hpvm/projects/llvm-cbe/test/test022.c
index 895069a83bc7b1c1df9d6f27784187940b493b35..1578e158914dd68f5b99c4c69b36e19f23217939 100644
--- a/hpvm/projects/llvm-cbe/test/test022.c
+++ b/hpvm/projects/llvm-cbe/test/test022.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 12;
+  int x = 6;
+  int y = 12;
 
-
-    if(x < y){
-        return x;
-    }
-    return 1;
+  if (x < y) {
+    return x;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test023.c b/hpvm/projects/llvm-cbe/test/test023.c
index 52348d3e1690624aa712ec6735e811f4ab958055..bc309ddb015a9af75cfcd3f09a7c5f5e093ba981 100644
--- a/hpvm/projects/llvm-cbe/test/test023.c
+++ b/hpvm/projects/llvm-cbe/test/test023.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 6;
+  int x = 6;
+  int y = 6;
 
-    if(x <= y){
-        return x;
-    }
-    return 1;
+  if (x <= y) {
+    return x;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test024.c b/hpvm/projects/llvm-cbe/test/test024.c
index 2c90879b87e3646db441c47e114b04f05de134a8..782d41a47880e3078af3d3775923870efba0a915 100644
--- a/hpvm/projects/llvm-cbe/test/test024.c
+++ b/hpvm/projects/llvm-cbe/test/test024.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 6;
+  int x = 6;
+  int y = 6;
 
-    if(x == y){
-        return x;
-    }
-    return 1;
+  if (x == y) {
+    return x;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test025.c b/hpvm/projects/llvm-cbe/test/test025.c
index 153cb4013477a27ad95248d38c62aa45bb2d5206..26bedf78ca25c8da2a1ba9c12694ddbdba087033 100644
--- a/hpvm/projects/llvm-cbe/test/test025.c
+++ b/hpvm/projects/llvm-cbe/test/test025.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 2;
+  int x = 6;
+  int y = 2;
 
-    if(x != y){
-        return x;
-    }
-    return 1;
+  if (x != y) {
+    return x;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test026.c b/hpvm/projects/llvm-cbe/test/test026.c
index 874c06957d200d2402974e3928aae339f5c2d16c..cf0b3e6ae94f24c8392a7b6f91a7dade1c1a6613 100644
--- a/hpvm/projects/llvm-cbe/test/test026.c
+++ b/hpvm/projects/llvm-cbe/test/test026.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-int x = 6;
-int y = 6;
-int z = 6;
+  int x = 6;
+  int y = 6;
+  int z = 6;
 
-    if(x == y && x == z){
-        return 6;
-    }
-    return 1;
+  if (x == y && x == z) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test027.c b/hpvm/projects/llvm-cbe/test/test027.c
index d1322597c34d5d3d284ae8d4de203b4bc769f998..f1e0adb31dc38ed4d35a25590281e3a9ac505474 100644
--- a/hpvm/projects/llvm-cbe/test/test027.c
+++ b/hpvm/projects/llvm-cbe/test/test027.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int x = 6;
-    int y = 6;
-    int z = 6;
+  int x = 6;
+  int y = 6;
+  int z = 6;
 
-    if(x == y || x != z){
-        return 6;
-    }
-    return 1;
+  if (x == y || x != z) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test028.c b/hpvm/projects/llvm-cbe/test/test028.c
index ce77d792f3b2e75d3795784a3a932c37c120c764..7e2ecdcf3f66c4637a15b2d2d19ddd3b5e740469 100644
--- a/hpvm/projects/llvm-cbe/test/test028.c
+++ b/hpvm/projects/llvm-cbe/test/test028.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = -7;
-    unsigned int b = 0;
+  unsigned int a = -7;
+  unsigned int b = 0;
 
-    b = ~a;
-    if( b == 6){
-        return 6;
-    }
-    return 1;
+  b = ~a;
+  if (b == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test029.c b/hpvm/projects/llvm-cbe/test/test029.c
index b7ac93ecf5f275ba5129a9bed6988b694cc0ca39..34d1ff5c8be474ccc629e9d10c51ca30b5cb8c10 100644
--- a/hpvm/projects/llvm-cbe/test/test029.c
+++ b/hpvm/projects/llvm-cbe/test/test029.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,16 +13,15 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 6;  //0110
-    unsigned int b = 15; //1111
-    unsigned int c = 0;
+  unsigned int a = 6;  // 0110
+  unsigned int b = 15; // 1111
+  unsigned int c = 0;
 
-    c = a&b;
-    if(c == 6){
-        return 6;
-    }
-    return 1;
+  c = a & b;
+  if (c == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test030.c b/hpvm/projects/llvm-cbe/test/test030.c
index 333ce5aa01915623200c600249ebb2377782a139..a88c910f8f25d85785ed5c5f03578157fc13b47d 100644
--- a/hpvm/projects/llvm-cbe/test/test030.c
+++ b/hpvm/projects/llvm-cbe/test/test030.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,16 +13,15 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-unsigned int a = 2;
-unsigned int b = 4;
-unsigned int c = 0;
+  unsigned int a = 2;
+  unsigned int b = 4;
+  unsigned int c = 0;
 
-    c = a|b;
-    if(c == 6){
-        return 6;
-    }
-    return 1;
+  c = a | b;
+  if (c == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test031.c b/hpvm/projects/llvm-cbe/test/test031.c
index 69d0dab0e1ff78ff13fce90e3ae68de355b7cf19..6e13a9f03fae2a6b47e6fa6000e7fcea7a74b8d1 100644
--- a/hpvm/projects/llvm-cbe/test/test031.c
+++ b/hpvm/projects/llvm-cbe/test/test031.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,16 +13,15 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 9;  //1001
-    unsigned int b = 15; //1111
-    unsigned int c = 0;
+  unsigned int a = 9;  // 1001
+  unsigned int b = 15; // 1111
+  unsigned int c = 0;
 
-
-    c = a^b;
-    if(c == 6){
-        return 6;
-    }
-    return 1;
+  c = a ^ b;
+  if (c == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test032.c b/hpvm/projects/llvm-cbe/test/test032.c
index ae63e2c4d26864d0bec5dc89dc6a80174e89c985..a98ab650e98bd0e147825bca50dca1bfcaea5809 100644
--- a/hpvm/projects/llvm-cbe/test/test032.c
+++ b/hpvm/projects/llvm-cbe/test/test032.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,16 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 3;  //0011
-    unsigned int b = 0;
+  unsigned int a = 3; // 0011
+  unsigned int b = 0;
 
-
-    b = a << 1;  //0110
-    if(b == 6){
-          return 6;
-    }
-    return 1;
+  b = a << 1; // 0110
+  if (b == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test033.c b/hpvm/projects/llvm-cbe/test/test033.c
index 1bb96d21bdef67392305cf8631aa7243ab77cb98..81b4177184b79eb5e282cafea5b00c60ce48a5a4 100644
--- a/hpvm/projects/llvm-cbe/test/test033.c
+++ b/hpvm/projects/llvm-cbe/test/test033.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 13;  //1100
-    unsigned int b = 0;
+  unsigned int a = 13; // 1100
+  unsigned int b = 0;
 
-    b = a >> 1;  //0110
-    if(b == 6){
-          return 6;
-    }
-    return 1;
+  b = a >> 1; // 0110
+  if (b == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test034.c b/hpvm/projects/llvm-cbe/test/test034.c
index dd9106b0be38c88c2a9a10a5c937c2c373ed5eed..977bf40358d94bb0bb10a66938b4f21b500aab57 100644
--- a/hpvm/projects/llvm-cbe/test/test034.c
+++ b/hpvm/projects/llvm-cbe/test/test034.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int a = 3;
-    int b = 3;
- 
-    a+=b;
-    if(a == 6){
-          return 6;
-    }
-    return 1;
+  int a = 3;
+  int b = 3;
+
+  a += b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test035.c b/hpvm/projects/llvm-cbe/test/test035.c
index d1c0ae391f15a0a8b3a118b2435667329ca86a85..8a7f23e17b1bc6f7569cc3352d6898ea033e40a5 100644
--- a/hpvm/projects/llvm-cbe/test/test035.c
+++ b/hpvm/projects/llvm-cbe/test/test035.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int a = 9;
-    int b = 3;
+  int a = 9;
+  int b = 3;
 
-    a-=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a -= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test036.c b/hpvm/projects/llvm-cbe/test/test036.c
index d8d5a1957c84d7afa5cc9d96a6afd64297172f21..019722660fb4641ddfbe13d3d5fcec7aa06102c6 100644
--- a/hpvm/projects/llvm-cbe/test/test036.c
+++ b/hpvm/projects/llvm-cbe/test/test036.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,15 +12,14 @@
 // Compound Multiplication Assignment(a*=b) operator.
 // *TW
 //===------------------------------------------------------------------------===//
-int main(){
+int main() {
 
-    int a = 2;
-    int b = 3;
+  int a = 2;
+  int b = 3;
 
-    a*=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a *= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test037.c b/hpvm/projects/llvm-cbe/test/test037.c
index 5bf5ee705a44ec929a70d0176690733bcc1fbcb8..2363c91ce91768e7dee329eec5db68beedf8076f 100644
--- a/hpvm/projects/llvm-cbe/test/test037.c
+++ b/hpvm/projects/llvm-cbe/test/test037.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int a = 30;
-    int b = 5;
+  int a = 30;
+  int b = 5;
 
-    a/=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a /= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test038.c b/hpvm/projects/llvm-cbe/test/test038.c
index efbe23460710f9f7108b18ba5e60ace5168338a6..1d6aa395aac2994f6bca35de131941154650038b 100644
--- a/hpvm/projects/llvm-cbe/test/test038.c
+++ b/hpvm/projects/llvm-cbe/test/test038.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    int a = 20;
-    int b = 14;
+  int a = 20;
+  int b = 14;
 
-    a%=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a %= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test039.c b/hpvm/projects/llvm-cbe/test/test039.c
index 112d7f69700f6d7bfad21bc58d91506f4d95b68e..53d4fcb9133bdf57967034a7ad02b231088c59ad 100644
--- a/hpvm/projects/llvm-cbe/test/test039.c
+++ b/hpvm/projects/llvm-cbe/test/test039.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 6;  //0110
-    unsigned int b = 15; //1111
+  unsigned int a = 6;  // 0110
+  unsigned int b = 15; // 1111
 
-    a&=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a &= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test040.c b/hpvm/projects/llvm-cbe/test/test040.c
index 5285fb73ecf5d3572c9cbf279be94debc6043e85..d174e7e88041eaca97dc2acb13d218c7ea8baba9 100644
--- a/hpvm/projects/llvm-cbe/test/test040.c
+++ b/hpvm/projects/llvm-cbe/test/test040.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 2;
-    unsigned int b = 4;
+  unsigned int a = 2;
+  unsigned int b = 4;
 
-    a|=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a |= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test041.c b/hpvm/projects/llvm-cbe/test/test041.c
index f04e682356dff0739053b0e91857805b952c8aec..45f64966d499f07f09a6b3904a8c2cd9fc9c71a0 100644
--- a/hpvm/projects/llvm-cbe/test/test041.c
+++ b/hpvm/projects/llvm-cbe/test/test041.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +13,14 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 9;  //1001
-    unsigned int b = 15; //1111
+  unsigned int a = 9;  // 1001
+  unsigned int b = 15; // 1111
 
-    a^=b;
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a ^= b;
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test042.c b/hpvm/projects/llvm-cbe/test/test042.c
index 5b4f12d80882f347efbe1ae59103f7eaf672c464..ec2547370b90902fffb0e61564bb02c782e20fd9 100644
--- a/hpvm/projects/llvm-cbe/test/test042.c
+++ b/hpvm/projects/llvm-cbe/test/test042.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 3;  //0011
+  unsigned int a = 3; // 0011
 
-    a <<= 1;  //0110
-    if( a == 6){
-        return 6;
-    }
-    return 1;
+  a <<= 1; // 0110
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test043.c b/hpvm/projects/llvm-cbe/test/test043.c
index 3b42179304a1741c268d8adf90192bafc5a2ba98..6aeb7bd17c9f40b6f86cdf1fd8ea2dbe520ce554 100644
--- a/hpvm/projects/llvm-cbe/test/test043.c
+++ b/hpvm/projects/llvm-cbe/test/test043.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 13;  //1100
+  unsigned int a = 13; // 1100
 
-    a >>= 1;  //0110
-    if(a == 6){
-        return 6;
-    }
-    return 1;
+  a >>= 1; // 0110
+  if (a == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test044.c b/hpvm/projects/llvm-cbe/test/test044.c
index dbb9d31ad940421d1d33d21af916eccd672ac2a8..f9b7c2d4632326b81ca526a82765152039269fba 100644
--- a/hpvm/projects/llvm-cbe/test/test044.c
+++ b/hpvm/projects/llvm-cbe/test/test044.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,17 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a char.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// char. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    char a = 'A' ; //65
-    int ia = 0;
+  char a = 'A'; // 65
+  int ia = 0;
 
-    ia = a;
-    ia-=59;
+  ia = a;
+  ia -= 59;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test045.c b/hpvm/projects/llvm-cbe/test/test045.c
index 50aaa8effcd3994d1dd47213d25748b1293f49f0..c8b57993a7edcbc691c641b1405f6f3ae137b65e 100644
--- a/hpvm/projects/llvm-cbe/test/test045.c
+++ b/hpvm/projects/llvm-cbe/test/test045.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed char a = 'A';
-    int ia = 0;
+  signed char a = 'A';
+  int ia = 0;
 
-    ia = a;
-    ia-=59;
+  ia = a;
+  ia -= 59;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test046.c b/hpvm/projects/llvm-cbe/test/test046.c
index ea57085caf034bdaf554169c439d21cabdfc1606..edbfe837fe615cea8ce58d5e7732da49632cf66c 100644
--- a/hpvm/projects/llvm-cbe/test/test046.c
+++ b/hpvm/projects/llvm-cbe/test/test046.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,13 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned char a = 'A';
-    int ia = 0;
+  unsigned char a = 'A';
+  int ia = 0;
 
-    ia = a;
-    ia-=59;
+  ia = a;
+  ia -= 59;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test047.c b/hpvm/projects/llvm-cbe/test/test047.c
index 2b90d14c7f9b195cdb94611925644cd4debb99ea..476cea234f53c18c684c9069c0675fc2effe48d3 100644
--- a/hpvm/projects/llvm-cbe/test/test047.c
+++ b/hpvm/projects/llvm-cbe/test/test047.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,13 +8,12 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning an int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// int. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-	int a = 6;
-        return a;
+  int a = 6;
+  return a;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test048.c b/hpvm/projects/llvm-cbe/test/test048.c
index c30694ff502502de722f99e2f8a21cfe79ddf17c..ee3966afccc9d7e3f0aaff62aec16142e28a601e 100644
--- a/hpvm/projects/llvm-cbe/test/test048.c
+++ b/hpvm/projects/llvm-cbe/test/test048.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,11 +13,11 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    short int a = 6;
-    int ia = 0;
-    ia = (int)a;
+  short int a = 6;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test049.c b/hpvm/projects/llvm-cbe/test/test049.c
index bb4a0801981e734a36518cf406fc8edd2213d0cd..5f29feffc05704adf39078d49177ab5edb5cffcf 100644
--- a/hpvm/projects/llvm-cbe/test/test049.c
+++ b/hpvm/projects/llvm-cbe/test/test049.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,11 +13,11 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    long int a = 6;
-    int ia = 0;
-    ia = (int)a;
+  long int a = 6;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test050.c b/hpvm/projects/llvm-cbe/test/test050.c
index f69c7cee23cbc47535ce653f28a26305195541e4..aa49757a320855970290cdef405a495052405ffe 100644
--- a/hpvm/projects/llvm-cbe/test/test050.c
+++ b/hpvm/projects/llvm-cbe/test/test050.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,11 +13,11 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed int a = 6;
-    int ia = 0;
-    ia = (int)a;
+  signed int a = 6;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test051.c b/hpvm/projects/llvm-cbe/test/test051.c
index 61f1e03d57d03c6a298ed50880d074e4f58e9e9a..0334eafdf30b2be0c6cde7dfeadaaf074d943608 100644
--- a/hpvm/projects/llvm-cbe/test/test051.c
+++ b/hpvm/projects/llvm-cbe/test/test051.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +13,12 @@
 // *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned int a = 6;
+  unsigned int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test052.c b/hpvm/projects/llvm-cbe/test/test052.c
index 48e1ce67f8edf77d03d1a9c75d72352b26f06511..3230b192b7b70080bb12318c983b86dcc4b3159b 100644
--- a/hpvm/projects/llvm-cbe/test/test052.c
+++ b/hpvm/projects/llvm-cbe/test/test052.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a float.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// float. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    float a = 6.0;
+  float a = 6.0;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test053.c b/hpvm/projects/llvm-cbe/test/test053.c
index 86dd5691a77f96fcd6e8568d22e93ef3a160872b..4ea19186428065a3813addb8537d5331c2709015 100644
--- a/hpvm/projects/llvm-cbe/test/test053.c
+++ b/hpvm/projects/llvm-cbe/test/test053.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a double.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// double. *TW
 //===------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    double a = 6.0;
+  double a = 6.0;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test054.c b/hpvm/projects/llvm-cbe/test/test054.c
index 4c86601412f5db9c6ec818f6029374dacaeebb60..caa7d00080554531f087c950da49975854b2d4aa 100644
--- a/hpvm/projects/llvm-cbe/test/test054.c
+++ b/hpvm/projects/llvm-cbe/test/test054.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a long double.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a long
+// double. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    long double a = 6.0;
+  long double a = 6.0;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test055.c b/hpvm/projects/llvm-cbe/test/test055.c
index cd7891acfe29906d1c0f6b6e9462ba2a95d8e747..4b85082d3353dea43862a1a0db736f1c43bf91fd 100644
--- a/hpvm/projects/llvm-cbe/test/test055.c
+++ b/hpvm/projects/llvm-cbe/test/test055.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a short.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// short. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    short a = 6;
+  short a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test056.c b/hpvm/projects/llvm-cbe/test/test056.c
index b12df1df990921aa6586cb6c4328733098c89386..305f044be1a342cd921f5bdc9ab22d938e172103 100644
--- a/hpvm/projects/llvm-cbe/test/test056.c
+++ b/hpvm/projects/llvm-cbe/test/test056.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a signed short.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// signed short. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed short a = 6;
+  signed short a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test057.c b/hpvm/projects/llvm-cbe/test/test057.c
index 50678081ec9ba22cc6274de0c6be137f913704bb..280ec876bda3a5ffb004d0eb14afa47250253c7d 100644
--- a/hpvm/projects/llvm-cbe/test/test057.c
+++ b/hpvm/projects/llvm-cbe/test/test057.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning an unsigned short.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// unsigned short. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned short a = 6;
+  unsigned short a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test058.c b/hpvm/projects/llvm-cbe/test/test058.c
index cdbfac068fe5aa9639691dcf869d016213e7dff7..f5404bd8336012b80e9f3d02074b4bd390924a84 100644
--- a/hpvm/projects/llvm-cbe/test/test058.c
+++ b/hpvm/projects/llvm-cbe/test/test058.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a signed short int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// signed short int. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed short int a = 6;
+  signed short int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test059.c b/hpvm/projects/llvm-cbe/test/test059.c
index 4de964a13ec97e47ff09e618ac6e9c232d9acf35..13b3ac08797e64625e89a9404cd35a0c27d21203 100644
--- a/hpvm/projects/llvm-cbe/test/test059.c
+++ b/hpvm/projects/llvm-cbe/test/test059.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a unsigned short int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// unsigned short int. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned short int a = 6;
+  unsigned short int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test060.c b/hpvm/projects/llvm-cbe/test/test060.c
index a0a6e16949f5730787137cbdf0bf5284ae6d292a..ecb393f2f368e5137d164a2996837db204c2f9f4 100644
--- a/hpvm/projects/llvm-cbe/test/test060.c
+++ b/hpvm/projects/llvm-cbe/test/test060.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    long a = 6;
+  long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test061.c b/hpvm/projects/llvm-cbe/test/test061.c
index d1bf812aa0c3312a6b0dfafd2e59866ec5bdc236..ac7cadd45fe6e5148c41f38dee679ee8bddad2e3 100644
--- a/hpvm/projects/llvm-cbe/test/test061.c
+++ b/hpvm/projects/llvm-cbe/test/test061.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a signed long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// signed long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed long a = 6;
+  signed long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test062.c b/hpvm/projects/llvm-cbe/test/test062.c
index 077ace8b321d7c9bd6a865bf0b2adb9bf892a3be..eaaf59853f711197b7049a993c48b939cbfab608 100644
--- a/hpvm/projects/llvm-cbe/test/test062.c
+++ b/hpvm/projects/llvm-cbe/test/test062.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a unsigned long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// unsigned long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned long a = 6;
+  unsigned long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test063.c b/hpvm/projects/llvm-cbe/test/test063.c
index 78fbe390f5e05fbbf35f149bf6dc3f56ecd69549..fa6cd18e88bef646c55391a68564355302e55775 100644
--- a/hpvm/projects/llvm-cbe/test/test063.c
+++ b/hpvm/projects/llvm-cbe/test/test063.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a signed long int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// signed long int. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed long int a = 6;
+  signed long int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test064.c b/hpvm/projects/llvm-cbe/test/test064.c
index c26a3da001557d18686ca20ec4de52bdd8e5e765..05a72b4b9a937ed87262c6ad40a9a24238b201dd 100644
--- a/hpvm/projects/llvm-cbe/test/test064.c
+++ b/hpvm/projects/llvm-cbe/test/test064.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,12 +13,12 @@
 // *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned long int a = 6;
+  unsigned long int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test065.c b/hpvm/projects/llvm-cbe/test/test065.c
index d9b299752c54e9238c6e2171342bb6f1c470163b..76958db4c2fe457f52cccac60f8dd3f49f8a868d 100644
--- a/hpvm/projects/llvm-cbe/test/test065.c
+++ b/hpvm/projects/llvm-cbe/test/test065.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a long long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a long
+// long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    long long a = 6;
+  long long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test066.c b/hpvm/projects/llvm-cbe/test/test066.c
index b4adc62240751fac572c9c1cede33279f51c7c90..10ec61f56ec72432bc43d8ae8af85226cd3f08e8 100644
--- a/hpvm/projects/llvm-cbe/test/test066.c
+++ b/hpvm/projects/llvm-cbe/test/test066.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a long long int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a long
+// long int. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    long long int a = 6;
+  long long int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test067.c b/hpvm/projects/llvm-cbe/test/test067.c
index 9d786b521063454dc42ba609fc742091ba3df1bb..e90cc8caea23b2baec248752eb84fe3c9afd3479 100644
--- a/hpvm/projects/llvm-cbe/test/test067.c
+++ b/hpvm/projects/llvm-cbe/test/test067.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a signed long long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning a
+// signed long long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed long long  a = 6;
+  signed long long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test068.c b/hpvm/projects/llvm-cbe/test/test068.c
index 1f72ecd1b7fa39c845c159ad3b9d86ce44d72547..5c0daa8a157d2ada06f4a1a4f2c66e1f3ac35354 100644
--- a/hpvm/projects/llvm-cbe/test/test068.c
+++ b/hpvm/projects/llvm-cbe/test/test068.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning a unsigned long long.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// unsigned long long. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned long long  a = 6;
+  unsigned long long a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test069.c b/hpvm/projects/llvm-cbe/test/test069.c
index bc611f13c1552f42c38cbf054134fe8fc6f37e24..6cae210ec65e8c1fe9c3827e49ed1147cfe42d22 100644
--- a/hpvm/projects/llvm-cbe/test/test069.c
+++ b/hpvm/projects/llvm-cbe/test/test069.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,12 +13,12 @@
 // *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    signed long long int a = 6;
+  signed long long int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test070.c b/hpvm/projects/llvm-cbe/test/test070.c
index 94c42bd8b5b4afee99d0cf18bec1806ceead963e..e9b55e232f54c9ac5ba6eea1e10a3babb88fb791 100644
--- a/hpvm/projects/llvm-cbe/test/test070.c
+++ b/hpvm/projects/llvm-cbe/test/test070.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===-------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE can handle declaring and returning an unsigned long long int.
-// *TW
+// This code tests to see that the CBE can handle declaring and returning an
+// unsigned long long int. *TW
 //===-------------------------------------------------------------------------------===//
 
-int main(){
+int main() {
 
-    unsigned long long int a = 6;
+  unsigned long long int a = 6;
 
-    int ia = 0;
-    ia = (int)a;
+  int ia = 0;
+  ia = (int)a;
 
-    return ia;
+  return ia;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test071.c b/hpvm/projects/llvm-cbe/test/test071.c
index 3e090147c7e09ed0ce208e305659083a17a31f81..357bc1e53330345808b5cf966bd9bbd4827f89af 100644
--- a/hpvm/projects/llvm-cbe/test/test071.c
+++ b/hpvm/projects/llvm-cbe/test/test071.c
@@ -8,13 +8,13 @@
 //===----------------------------------------------------------------------===//
 //
 // This code tests to see that the CBE will execute an if statement correctly.
-// *TW 
+// *TW
 //
 //===----------------------------------------------------------------------===//
 
 int main() {
-   int x = 6;
-   if (x == 6)
-      return x;
-	return 0;
+  int x = 6;
+  if (x == 6)
+    return x;
+  return 0;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test072.c b/hpvm/projects/llvm-cbe/test/test072.c
index 7c7cbcb391bb51a53e20bfae8aabb23cf0bc2ef6..87cbd91bb0591e452c9ada49c94968ab63c84c64 100644
--- a/hpvm/projects/llvm-cbe/test/test072.c
+++ b/hpvm/projects/llvm-cbe/test/test072.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//--------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,19 +8,18 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute an else-if statement correctly.
-// *TW
+// This code tests to see that the CBE will execute an else-if statement
+// correctly. *TW
 //
 //===---------------------------------------------------------------------------===//
 
 int main() {
-   int x = 6;
-   if (x == 4) {
-      return 2;
-   } else if (x == 6){
-        return 6;
-     } else {
-          return 8;
-     }
+  int x = 6;
+  if (x == 4) {
+    return 2;
+  } else if (x == 6) {
+    return 6;
+  } else {
+    return 8;
+  }
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test073.c b/hpvm/projects/llvm-cbe/test/test073.c
index 006a7348e87c6259a41227f731542cdfe1f931d2..2e664c4c73bfe827019ff0cc3aae9e9f4037155d 100644
--- a/hpvm/projects/llvm-cbe/test/test073.c
+++ b/hpvm/projects/llvm-cbe/test/test073.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//--------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +8,16 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a do-while statement correctly.
-// *TW
+// This code tests to see that the CBE will execute a do-while statement
+// correctly. *TW
 //
 //===---------------------------------------------------------------------------===//
 
 int main() {
-   int x = 0;
-   do {
-      x++;
-   } while (x < 6);
+  int x = 0;
+  do {
+    x++;
+  } while (x < 6);
 
-   return x;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test074.c b/hpvm/projects/llvm-cbe/test/test074.c
index bb3ff37858bdc25554481a48810c597f7b2f176e..903af81861c08822d9b9c9078bd0cd14d977f612 100644
--- a/hpvm/projects/llvm-cbe/test/test074.c
+++ b/hpvm/projects/llvm-cbe/test/test074.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//--------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,19 +8,18 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a break/continue statement correctly.
-// *TW
+// This code tests to see that the CBE will execute a break/continue statement
+// correctly. *TW
 //
 //===---------------------------------------------------------------------------===//
 
 int main() {
-   int x;
-   for (x=0; x<=25; x++) {
-      if (x == 6)
-         break;
-      if (x < 15)
-         continue;
-   }
-   return x;
+  int x;
+  for (x = 0; x <= 25; x++) {
+    if (x == 6)
+      break;
+    if (x < 15)
+      continue;
+  }
+  return x;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test075.c b/hpvm/projects/llvm-cbe/test/test075.c
index a0601622c2f897e513615bcbd9b0a91176a26a5b..55562b99efb1414e10e139188919ee1c03e9133e 100644
--- a/hpvm/projects/llvm-cbe/test/test075.c
+++ b/hpvm/projects/llvm-cbe/test/test075.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C --------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//--------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,22 +8,21 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a Goto-Label statement correctly.
-// *TW
+// This code tests to see that the CBE will execute a Goto-Label statement
+// correctly. *TW
 //
 //===---------------------------------------------------------------------------===//
 
 int main() {
-   int x = 0;
-   goto label;
-   
-   for(;;) {
-      x = 10;
-      return x;
-   }
+  int x = 0;
+  goto label;
 
-   label:
-   x = 6;
-   return x;
+  for (;;) {
+    x = 10;
+    return x;
+  }
 
+label:
+  x = 6;
+  return x;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test076.c b/hpvm/projects/llvm-cbe/test/test076.c
index d5f149eb3b51471ce23d8b9baa3186d58e509b44..faf56a3e37e14594fbfbfc3894e504989fcfdea1 100644
--- a/hpvm/projects/llvm-cbe/test/test076.c
+++ b/hpvm/projects/llvm-cbe/test/test076.c
@@ -15,8 +15,8 @@
 
 int main() {
 
-   int x = 6, y = 0, *ip = 0;
-   ip = &x;
-   y = *ip;
-   return y;
+  int x = 6, y = 0, *ip = 0;
+  ip = &x;
+  y = *ip;
+  return y;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test077.c b/hpvm/projects/llvm-cbe/test/test077.c
index a6e1fc7985b1bb5c9d1ac6943533d5c523a7b18d..771463d5afe5dafd1b3975697d807ab3767b3915 100644
--- a/hpvm/projects/llvm-cbe/test/test077.c
+++ b/hpvm/projects/llvm-cbe/test/test077.c
@@ -14,11 +14,11 @@
 //===----------------------------------------------------------------------===//
 
 int main() {
-   char x = 'a', y = 'b', *cp;
-   cp = &x;
-   y = *cp;
-   if (y == 'a'){
-      return 6;
-   }
-   return 1;
+  char x = 'a', y = 'b', *cp;
+  cp = &x;
+  y = *cp;
+  if (y == 'a') {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test078.c b/hpvm/projects/llvm-cbe/test/test078.c
index cc60c18e34b3ecf4a18401fe604d1b45d1f8d1b1..f511a93fd34e3f62ea2d9a515cc6dd70449dc0b3 100644
--- a/hpvm/projects/llvm-cbe/test/test078.c
+++ b/hpvm/projects/llvm-cbe/test/test078.c
@@ -15,9 +15,9 @@
 #include <stddef.h>
 
 int main() {
-   int *ptr = NULL;
-    if (ptr == 0){
-        return 6;
-    }
-    return 1;
+  int *ptr = NULL;
+  if (ptr == 0) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test079.c b/hpvm/projects/llvm-cbe/test/test079.c
index fd1ea110398c435cb9276fd064f6190aba0b5470..12b3477e32ae06f3be29a89b0fc18bac338a0b57 100644
--- a/hpvm/projects/llvm-cbe/test/test079.c
+++ b/hpvm/projects/llvm-cbe/test/test079.c
@@ -14,12 +14,11 @@
 //===----------------------------------------------------------------------===//
 
 int main() {
-   double x = 6, y = 0, *dp;
-   dp = &x;
-   y = *dp;
-   if (y == 6){
-      return 6;
-   }
-   return 1;
+  double x = 6, y = 0, *dp;
+  dp = &x;
+  y = *dp;
+  if (y == 6) {
+    return 6;
+  }
+  return 1;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test080.c b/hpvm/projects/llvm-cbe/test/test080.c
index b7fb855bf45dc87a03bd9ca24a785516250ead4b..9b42fab5d93a392064b8a22bf97d325cd0ae24cc 100644
--- a/hpvm/projects/llvm-cbe/test/test080.c
+++ b/hpvm/projects/llvm-cbe/test/test080.c
@@ -14,11 +14,11 @@
 //===----------------------------------------------------------------------===//
 
 int main() {
-   float x = 6, y = 0, *fp;
-   fp = &x;
-   y = *fp;
-   if (y == 6){
-      return 6;
-   }
-   return 1;
+  float x = 6, y = 0, *fp;
+  fp = &x;
+  y = *fp;
+  if (y == 6) {
+    return 6;
+  }
+  return 1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test081.c b/hpvm/projects/llvm-cbe/test/test081.c
index 6efcad46eadbb23dcc85b7f5aefa43855383bbe9..e032f57f519165cb7a35578a68babe127edea66f 100644
--- a/hpvm/projects/llvm-cbe/test/test081.c
+++ b/hpvm/projects/llvm-cbe/test/test081.c
@@ -7,17 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will properly use the address-of value (&)
-// variable and and return the value-at address (*) variable from integer 'num'.
-// *TW
+// This code tests to see that the CBE will properly use the address-of value
+// (&) variable and return the value-at address (*) variable from integer
+// 'num'. *TW
 //
 //===----------------------------------------------------------------------===//
 
-int main(){
-   int *ptr;
-   int num = 6;
-   ptr = &num;
-   int deref = *ptr;
-   return deref;
-
+int main() {
+  int *ptr;
+  int num = 6;
+  ptr = &num;
+  int deref = *ptr;
+  return deref;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test082.c b/hpvm/projects/llvm-cbe/test/test082.c
index e30bb7a2f192adc03fd646bc625340c847cfe92c..7a7bf109eb1dc2b29ab1e66c4b1b804df6e586e3 100644
--- a/hpvm/projects/llvm-cbe/test/test082.c
+++ b/hpvm/projects/llvm-cbe/test/test082.c
@@ -13,13 +13,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-struct Number{
-   int price;
+struct Number {
+  int price;
 };
 
-int main(){
-   struct Number a;
-   struct Number* ptr = &a;
-   ptr->price = 6;
-   return ptr->price;
+int main() {
+  struct Number a;
+  struct Number *ptr = &a;
+  ptr->price = 6;
+  return ptr->price;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test083.c b/hpvm/projects/llvm-cbe/test/test083.c
index 5dc920edf485b69c20b78886a4eb2229af9151ad..58eb4c14a3dcd37623a1bd972705ac0e4cd46703 100644
--- a/hpvm/projects/llvm-cbe/test/test083.c
+++ b/hpvm/projects/llvm-cbe/test/test083.c
@@ -13,12 +13,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-int main(){
-   int *ip;
-   int a[2];
-   a[0] = 1;
-   a[1] = 6;
-   ip = &a[1];
+int main() {
+  int *ip;
+  int a[2];
+  a[0] = 1;
+  a[1] = 6;
+  ip = &a[1];
 
-   return *ip;
+  return *ip;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test084.c b/hpvm/projects/llvm-cbe/test/test084.c
index 6f5b3ad6d9cc526c609719308fa1da9a8ab6ab47..3a67fc1ef9cef5b1340e68eb41d93500000c5a26 100644
--- a/hpvm/projects/llvm-cbe/test/test084.c
+++ b/hpvm/projects/llvm-cbe/test/test084.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,9 +8,9 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will properly increment a pointer via int.
-// This example works by subtracting two mem. addresses and adding 2 to return 6.
-// *TW
+// This code tests to see that the CBE will properly increment a pointer via
+// int. This example works by subtracting two mem. addresses and adding 2 to
+// return 6. *TW
 //
 //===---------------------------------------------------------------------------===//
 
@@ -20,9 +21,9 @@ int main() {
   intptr_t inc0 = 0, inc1 = 0, diff = 0, a = 100;
   intptr_t *p = &a;
   inc0 = (intptr_t)p;
-  ++(*p++);  //++(*p++);
+  ++(*p++); //++(*p++);
   inc1 = (intptr_t)p;
-  diff =  inc1-inc0;
+  diff = inc1 - inc0;
   diff += 2;
   return diff;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test085.c b/hpvm/projects/llvm-cbe/test/test085.c
index 01e8d65e6cbb83bc21b46ff9c493284f8e41d2cd..04c47b83d6bce8e9cb8dba4deff3171ece95b46c 100644
--- a/hpvm/projects/llvm-cbe/test/test085.c
+++ b/hpvm/projects/llvm-cbe/test/test085.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,9 +8,9 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will properly decrement a pointer via int.
-// This example works by subtracting two mem. addresses and adding 2 to return 6.
-// *TW
+// This code tests to see that the CBE will properly decrement a pointer via
+// int. This example works by subtracting two mem. addresses and adding 2 to
+// return 6. *TW
 //
 //===---------------------------------------------------------------------------===//
 
@@ -22,8 +23,7 @@ int main() {
   inc0 = (intptr_t)p;
   --(*p--); //--(*p--);
   inc1 = (intptr_t)p;
-  diff =  inc0-inc1;
+  diff = inc0 - inc1;
   diff += 2;
   return diff;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test086.c b/hpvm/projects/llvm-cbe/test/test086.c
index 72e7f03901df7570e5e134c4707c20e8fada74a5..32e33e992378733e721082dc94c339b64bc1cd81 100644
--- a/hpvm/projects/llvm-cbe/test/test086.c
+++ b/hpvm/projects/llvm-cbe/test/test086.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,9 +8,9 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will properly increment a pointer via char.
-// This example works by subtracting two mem. addresses and adding 2 to return 6.
-// *TW
+// This code tests to see that the CBE will properly increment a pointer via
+// char. This example works by subtracting two mem. addresses and adding 2 to
+// return 6. *TW
 //
 //===---------------------------------------------------------------------------===//
 
@@ -24,5 +25,5 @@ int main() {
   // diff =  inc1-inc0;
   // diff += 2;
   // return diff;
-  return 6; //TODO
+  return 6; // TODO
 }
diff --git a/hpvm/projects/llvm-cbe/test/test087.c b/hpvm/projects/llvm-cbe/test/test087.c
index 29291167906a5cb9fd3aedaa0d3523eaa54d5bbd..6c983a65d62b9a71c9c2be11a8107e734628f999 100644
--- a/hpvm/projects/llvm-cbe/test/test087.c
+++ b/hpvm/projects/llvm-cbe/test/test087.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,9 +8,9 @@
 //
 //===---------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will properly decrement a pointer via char.
-// This example works by subtracting two mem. addresses and adding 2 to return 6.
-// *TW
+// This code tests to see that the CBE will properly decrement a pointer via
+// char. This example works by subtracting two mem. addresses and adding 2 to
+// return 6. *TW
 //===---------------------------------------------------------------------------===//
 
 int main() {
@@ -23,5 +24,5 @@ int main() {
   // diff =  inc0-inc1;
   // diff += 2;
   // return diff;
-  return 6; //TODO
+  return 6; // TODO
 }
diff --git a/hpvm/projects/llvm-cbe/test/test088.c b/hpvm/projects/llvm-cbe/test/test088.c
index 938237bea9774b7c9e52b36ae20d814bb563507c..7cefca1537290d57c2adce83321b848ca82fcbe3 100644
--- a/hpvm/projects/llvm-cbe/test/test088.c
+++ b/hpvm/projects/llvm-cbe/test/test088.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C -------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//-------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,14 +13,14 @@
 // *TW
 //===---------------------------------------------------------------------------===//
 
-int main(){
-   int a[2][2];
-   int *ip;
-   a[0][0] = 0;
-   a[0][1] = 1;
-   a[1][0] = 3;
-   a[1][1] = 6;
-   ip = &a[1][1];
+int main() {
+  int a[2][2];
+  int *ip;
+  a[0][0] = 0;
+  a[0][1] = 1;
+  a[1][0] = 3;
+  a[1][1] = 6;
+  ip = &a[1][1];
 
-   return *ip;
+  return *ip;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test089.c b/hpvm/projects/llvm-cbe/test/test089.c
index 925c3bb56ba77bb395641197dd4b5cef231d369e..59b20d5b45ba6c4d4d0cd70e04d0fd99d0253964 100644
--- a/hpvm/projects/llvm-cbe/test/test089.c
+++ b/hpvm/projects/llvm-cbe/test/test089.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,20 +8,20 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute data-packing in a structure correctly.
-// *TW
+// This code tests to see that the CBE will execute data-packing in a structure
+// correctly. *TW
 //===------------------------------------------------------------------------------===//
 
 #pragma pack(push)
 #pragma pack(1)
 
-struct DataSize{
-    char Data2;
-    char Data3;
-    int Data1;
+struct DataSize {
+  char Data2;
+  char Data3;
+  int Data1;
 };
 
-int main(){
-    struct DataSize example;
-    return sizeof(example);
+int main() {
+  struct DataSize example;
+  return sizeof(example);
 }
diff --git a/hpvm/projects/llvm-cbe/test/test090.c b/hpvm/projects/llvm-cbe/test/test090.c
index 021a05e8a002bcf2320df59c7e39c2963e52c756..d3e64ff5b9b21a68147c0a0aab69d74d05fc93e4 100644
--- a/hpvm/projects/llvm-cbe/test/test090.c
+++ b/hpvm/projects/llvm-cbe/test/test090.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,19 +8,19 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a union and check the data size correctly.
-// *TW
+// This code tests to see that the CBE will execute a union and check the data
+// size correctly. *TW
 //===------------------------------------------------------------------------------===//
 
-union Data{
-   int i;
-   float f;
-   char  str[8];
+union Data {
+  int i;
+  float f;
+  char str[8];
 };
 
-int main(){
-   union Data data;
-   int datasize = sizeof(data) - 2;
+int main() {
+  union Data data;
+  int datasize = sizeof(data) - 2;
 
-   return datasize;
+  return datasize;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test091.c b/hpvm/projects/llvm-cbe/test/test091.c
index dce59d85d5b788696deb7e0b4e0a97e69cdea0e8..557286e1ddd2326f912d4ae788218fce536a0a25 100644
--- a/hpvm/projects/llvm-cbe/test/test091.c
+++ b/hpvm/projects/llvm-cbe/test/test091.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,19 +8,17 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will access and return union members correctly.
-// *TW
+// This code tests to see that the CBE will access and return union members
+// correctly. *TW
 //===------------------------------------------------------------------------------===//
 
-union Data{
-   char unit1[6];
-   char unit2;
-   char unit3;
+union Data {
+  char unit1[6];
+  char unit2;
+  char unit3;
 };
 
-int main(){
-   union Data data;
-   return sizeof(data);
+int main() {
+  union Data data;
+  return sizeof(data);
 }
-
-
diff --git a/hpvm/projects/llvm-cbe/test/test092.c b/hpvm/projects/llvm-cbe/test/test092.c
index 3b197f21a5f8964daf0ac427955df96faa9feec2..8018bca7eecd1b4196f822bc354108e6b5e8dc27 100644
--- a/hpvm/projects/llvm-cbe/test/test092.c
+++ b/hpvm/projects/llvm-cbe/test/test092.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,29 +8,25 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will pass a structure into a function correctly.
-// *TW
+// This code tests to see that the CBE will pass a structure into a function
+// correctly. *TW
 //===------------------------------------------------------------------------------===//
 
 int k = 0;
 
-struct test{
-   int i;
-   float f;
+struct test {
+  int i;
+  float f;
 };
 
-void funct(struct test example){
-   k = example.i;
-}
+void funct(struct test example) { k = example.i; }
 
-int main(){
-   struct test example;
+int main() {
+  struct test example;
 
-   example.i = 6;
-   example.f = 6.0;
-   funct(example);
+  example.i = 6;
+  example.f = 6.0;
+  funct(example);
 
-   return k;
+  return k;
 }
-
-
diff --git a/hpvm/projects/llvm-cbe/test/test093.c b/hpvm/projects/llvm-cbe/test/test093.c
index 3553edea3a5fdb8680feaf2297ab32938ca2c608..9a6188e7d4d13b8e5b73a2e0cf832cb3ddb0f0ba 100644
--- a/hpvm/projects/llvm-cbe/test/test093.c
+++ b/hpvm/projects/llvm-cbe/test/test093.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,19 +12,19 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-struct layer1{
-   int depth1;
-   char name1[20];
+struct layer1 {
+  int depth1;
+  char name1[20];
 };
 
-struct layer2{
-   int depth2;
-   char name2[20];
-   struct layer1 layer_data;
-}layer2_data;
+struct layer2 {
+  int depth2;
+  char name2[20];
+  struct layer1 layer_data;
+} layer2_data;
 
-int main(){
-   struct layer2 layer2_data = {1, "test", {6, "test2"}};
+int main() {
+  struct layer2 layer2_data = {1, "test", {6, "test2"}};
 
-   return layer2_data.layer_data.depth1;
+  return layer2_data.layer_data.depth1;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test094.c b/hpvm/projects/llvm-cbe/test/test094.c
index 2568c9c3537d9cedce0cb36e86a414c068493504..8faf3330cc9f360debb2434f2720cfead79be20e 100644
--- a/hpvm/projects/llvm-cbe/test/test094.c
+++ b/hpvm/projects/llvm-cbe/test/test094.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,18 +13,17 @@
 //===------------------------------------------------------------------------------===//
 
 typedef struct test {
- int var1;
- int var2;
- int var3;
-}testrename;
+  int var1;
+  int var2;
+  int var3;
+} testrename;
 
-int main(){
-    testrename variable;
+int main() {
+  testrename variable;
 
-    variable.var2 = 5;
-    variable.var3 = 6;
-    variable.var1 = 9;
+  variable.var2 = 5;
+  variable.var3 = 6;
+  variable.var1 = 9;
 
-    return variable.var3;
+  return variable.var3;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test095.c b/hpvm/projects/llvm-cbe/test/test095.c
index 21db27203416db2f9454ce203eed555299465a40..b622c4b94c071548e734f4dd4ceec7097b5b90a2 100644
--- a/hpvm/projects/llvm-cbe/test/test095.c
+++ b/hpvm/projects/llvm-cbe/test/test095.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,17 +12,16 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-struct Shows
-    {
-     char show[20];
-     int runlength;
-     int rating;
+struct Shows {
+  char show[20];
+  int runlength;
+  int rating;
 };
 
-int main(){
-struct Shows b1[3] = {
-        {"Big Bang Theory",22,6},
-        {"NCIS",45,9},
-    };
-    return b1[0].rating;
+int main() {
+  struct Shows b1[3] = {
+      {"Big Bang Theory", 22, 6},
+      {"NCIS", 45, 9},
+  };
+  return b1[0].rating;
 }
diff --git a/hpvm/projects/llvm-cbe/test/test096.c b/hpvm/projects/llvm-cbe/test/test096.c
index 81661df1212b75da06f9eabcc9e64c82118172ad..35982e134131b895bcf15cbb22bda76e482d1e0b 100644
--- a/hpvm/projects/llvm-cbe/test/test096.c
+++ b/hpvm/projects/llvm-cbe/test/test096.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,18 +8,18 @@
 //
 //===------------------------------------------------------------------------------===//
 //
-// This code tests to see that the CBE will execute a self referencing structure.
-// *TW
+// This code tests to see that the CBE will execute a self referencing
+// structure. *TW
 //===------------------------------------------------------------------------------===//
 #include <stdio.h> //for NULL
 
-struct data{
-   int a;
-   struct data *ptr;
+struct data {
+  int a;
+  struct data *ptr;
 };
 
-int main(){
-   struct data p=(struct data){.a=3,.ptr=&(struct data){.a=6,.ptr=NULL}};
-   return p.ptr->a;
+int main() {
+  struct data p =
+      (struct data){.a = 3, .ptr = &(struct data){.a = 6, .ptr = NULL}};
+  return p.ptr->a;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test097.c b/hpvm/projects/llvm-cbe/test/test097.c
index a42e36b6cb43551113d7c38984895a07a479ffc4..6e0f8145b0909b0c6c6b3f26e09633b8ccc58b12 100644
--- a/hpvm/projects/llvm-cbe/test/test097.c
+++ b/hpvm/projects/llvm-cbe/test/test097.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,17 +12,16 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-int addby2 ( int x );
+int addby2(int x);
 
-int main( ){
-    int n ;
-    n = addby2 ( 4 ) ;
-    return n;
+int main() {
+  int n;
+  n = addby2(4);
+  return n;
 }
 
-int addby2(int x){
-    int p ;
-    p = x + 2 ;
-    return ( p ) ;
+int addby2(int x) {
+  int p;
+  p = x + 2;
+  return (p);
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test098.c b/hpvm/projects/llvm-cbe/test/test098.c
index 70de117e51a9064e638354fe78072e93a635c904..d8594b5a7615b6be6fcc0cb7a04b9e5ff972acd3 100644
--- a/hpvm/projects/llvm-cbe/test/test098.c
+++ b/hpvm/projects/llvm-cbe/test/test098.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,18 +12,18 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-int subtrby2 ( int x );
+int subtrby2(int x);
 static int eight = 8;
 static int two = 2;
 
-int main( ){
-    int n ;
-    n = subtrby2 ( eight ) ;
-    return n;
+int main() {
+  int n;
+  n = subtrby2(eight);
+  return n;
 }
 
-int subtrby2(int x){
-    int p ;
-    p = x - two ;
-    return ( p ) ;
+int subtrby2(int x) {
+  int p;
+  p = x - two;
+  return (p);
 }
diff --git a/hpvm/projects/llvm-cbe/test/test099.c b/hpvm/projects/llvm-cbe/test/test099.c
index 1c4713262eeaf6042f7af0ee7e6e41547f226bb2..c4ab77522b27cdf29251409e707bb6548891e9a7 100644
--- a/hpvm/projects/llvm-cbe/test/test099.c
+++ b/hpvm/projects/llvm-cbe/test/test099.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,9 +13,8 @@
 //===------------------------------------------------------------------------------===//
 
 int main() {
-    register int counter = 0;
-    counter += 6;
+  register int counter = 0;
+  counter += 6;
 
-    return 6;
+  return 6;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test100.c b/hpvm/projects/llvm-cbe/test/test100.c
index db2cd9ea604e3a5aa1b64eaf4159ae9f1fe2700c..2b6a07912d94388827c9cde38e997ca96249b269 100644
--- a/hpvm/projects/llvm-cbe/test/test100.c
+++ b/hpvm/projects/llvm-cbe/test/test100.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,20 +12,19 @@
 // *TW
 //===------------------------------------------------------------------------------===//
 
-int fibonaci(int i){
-   if(i == 0){
-      return 0;
-   }
-   if(i == 1){
-      return 1;
-   }
-   return fibonaci(i-1) + fibonaci(i-2);
+int fibonaci(int i) {
+  if (i == 0) {
+    return 0;
+  }
+  if (i == 1) {
+    return 1;
+  }
+  return fibonaci(i - 1) + fibonaci(i - 2);
 }
 
-int  main(){
-    int returnval;
-    returnval = fibonaci(6) - 2;
+int main() {
+  int returnval;
+  returnval = fibonaci(6) - 2;
 
-    return returnval;
+  return returnval;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test101.c b/hpvm/projects/llvm-cbe/test/test101.c
index 50d18d3ec33746d58a24cf342247e717d926d31a..ffffeb592072391026a4b0c3a705e8c63db235fd 100644
--- a/hpvm/projects/llvm-cbe/test/test101.c
+++ b/hpvm/projects/llvm-cbe/test/test101.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,24 +15,26 @@
 
 unsigned int fastfib(unsigned int n);
 
-int main(){
-    return fastfib(6) - 2;
-}
+int main() { return fastfib(6) - 2; }
 
-unsigned int fastfib(unsigned int n){
-    unsigned int a[3];
-    unsigned int *p=a;
-    unsigned int i;
+unsigned int fastfib(unsigned int n) {
+  unsigned int a[3];
+  unsigned int *p = a;
+  unsigned int i;
 
-    for(i=0; i<=n; ++i) {
-        if(i<2) *p=i;
-        else{
-            if(p==a) *p=*(a+1)+*(a+2);
-            else if(p==a+1) *p=*a+*(a+2);
-            else *p=*a+*(a+1);
-        }
-        if(++p>a+2) p=a;
+  for (i = 0; i <= n; ++i) {
+    if (i < 2)
+      *p = i;
+    else {
+      if (p == a)
+        *p = *(a + 1) + *(a + 2);
+      else if (p == a + 1)
+        *p = *a + *(a + 2);
+      else
+        *p = *a + *(a + 1);
     }
-    return p==a?*(p+2):*(p-1);
+    if (++p > a + 2)
+      p = a;
+  }
+  return p == a ? *(p + 2) : *(p - 1);
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test102.c b/hpvm/projects/llvm-cbe/test/test102.c
index 572ea0310334592c668e6266da7c364d39a80ebb..44247c6231a26acfca041e6896bcbb300d2bc6f5 100644
--- a/hpvm/projects/llvm-cbe/test/test102.c
+++ b/hpvm/projects/llvm-cbe/test/test102.c
@@ -1,4 +1,5 @@
-//===-- CBackend.cpp - Library for converting LLVM code to C ----------------------===//
+//===-- CBackend.cpp - Library for converting LLVM code to C
+//----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/hpvm/projects/llvm-cbe/test/test103.c b/hpvm/projects/llvm-cbe/test/test103.c
index 6e2329021d257f46fb2b818e68f932e90899b8d8..e751c2d8a4e3c2249921b15c833ae0e99a47d10a 100644
--- a/hpvm/projects/llvm-cbe/test/test103.c
+++ b/hpvm/projects/llvm-cbe/test/test103.c
@@ -15,9 +15,8 @@
 #define B 3
 #define C A + B
 
-int main(){
+int main() {
 
-   int x = C;
-   return x;
+  int x = C;
+  return x;
 }
-
diff --git a/hpvm/projects/llvm-cbe/test/test104.c b/hpvm/projects/llvm-cbe/test/test104.c
index 88884d68575f413784f039a1685430c8e1dce56e..43c29dedb685484fd779d0565eaf3d30f97c160a 100644
--- a/hpvm/projects/llvm-cbe/test/test104.c
+++ b/hpvm/projects/llvm-cbe/test/test104.c
@@ -12,13 +12,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-int tail (int n) {
+int tail(int n) {
   if (n == 6)
     return n;
   else
-    return tail(n+1);
+    return tail(n + 1);
 }
 
-int main(){
-  return tail(0);
-}
+int main() { return tail(0); }
diff --git a/hpvm/projects/llvm-cbe/test/test105.c b/hpvm/projects/llvm-cbe/test/test105.c
index 7e830d55c55182e5d995a8841c41132555c54ee4..79ab340aef5c7db27c06d076efa95bb85fb5a964 100644
--- a/hpvm/projects/llvm-cbe/test/test105.c
+++ b/hpvm/projects/llvm-cbe/test/test105.c
@@ -12,13 +12,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-int head(int n){
-  if(n == 6)
+int head(int n) {
+  if (n == 6)
     return n;
   else
-    return head(n+1);
+    return head(n + 1);
 }
 
-int main(){
-  return head(0);
-}
+int main() { return head(0); }
diff --git a/hpvm/projects/llvm-cbe/test/testbad.c b/hpvm/projects/llvm-cbe/test/testbad.c
index a7456dc2b52888358ae2e7fce0da5b0c799c9b45..a8a9bca17c49e5ebed010beeeb987ac5904b3b10 100644
--- a/hpvm/projects/llvm-cbe/test/testbad.c
+++ b/hpvm/projects/llvm-cbe/test/testbad.c
@@ -11,7 +11,4 @@
 //
 //===----------------------------------------------------------------------===//
 
-int main()
-{
-    return 25;
-}
+int main() { return 25; }
diff --git a/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp b/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp
index 6c4750182516934ae627d7b25baa8e8e98daa6ba..87a67a7364aa8b5132d7e82fd9c4e1005d3ce6fa 100644
--- a/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp
+++ b/hpvm/projects/llvm-cbe/tools/llvm-cbe/llvm-cbe.cpp
@@ -192,8 +192,8 @@ int main(int argc, char **argv) {
   initializeLowerIntrinsicsPass(*Registry);
   initializeUnreachableBlockElimLegacyPassPass(*Registry);
 
-	// Adding necessary passes for loop generation
-	initializeLoopInfoWrapperPassPass(*Registry);
+  // Adding necessary passes for loop generation
+  initializeLoopInfoWrapperPassPass(*Registry);
   initializePostDominatorTreeWrapperPassPass(*Registry);
   initializeScalarEvolutionWrapperPassPass(*Registry);
   initializeDominatorTreeWrapperPassPass(*Registry);
diff --git a/hpvm/projects/visc-rt/device_abstraction.h b/hpvm/projects/visc-rt/device_abstraction.h
index 68748c7ab73d316c7bf296e67d88c0114b4cac81..7e77d100deb6b23b6ed9ca994796cd1cb108b0d4 100644
--- a/hpvm/projects/visc-rt/device_abstraction.h
+++ b/hpvm/projects/visc-rt/device_abstraction.h
@@ -1,14 +1,13 @@
 #ifndef __DEVICE_ABSTRACTION__
 #define __DEVICE_ABSTRACTION__
 
+#include <fstream>
+#include <iostream>
 #include <stdio.h>
 #include <stdlib.h>
-#include <time.h>
-#include <time.h>
 #include <thread>
+#include <time.h>
 #include <vector>
-#include <iostream>
-#include <fstream>
 
 #define MIN_INTERVAL 2
 #define MAX_INTERVAL 8
@@ -23,14 +22,13 @@ std::vector<unsigned> Intervals;
 // simulation
 volatile bool executionEnd = false;
 
-
 void initializeDeviceStatusIntervals() {
 
   unsigned sz = 0;
   unsigned tmp = 0;
 
-  const char *fn =
-    "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/deviceStatusSwitchIntervals.txt";
+  const char *fn = "/home/kotsifa2/HPVM/hpvm/build/projects/visc-rt/"
+                   "deviceStatusSwitchIntervals.txt";
   std::ifstream infile;
   infile.open(fn);
   if (!infile.is_open()) {
@@ -55,10 +53,11 @@ void initializeDeviceStatusIntervals() {
       std::cout << "Failed to open " << fn << " for writing\n";
       return;
     }
-    sz = 1 + rand()%NUM_INTERVALS;
-    outfile << sz; 
+    sz = 1 + rand() % NUM_INTERVALS;
+    outfile << sz;
     for (unsigned i = 0; i < sz; i++) {
-      Intervals.push_back(MIN_INTERVAL + rand()%(MAX_INTERVAL - MIN_INTERVAL));
+      Intervals.push_back(MIN_INTERVAL +
+                          rand() % (MAX_INTERVAL - MIN_INTERVAL));
       outfile << Intervals[i];
     }
     outfile.close();
@@ -71,12 +70,11 @@ void updateDeviceStatus() {
 
   unsigned i = 0;
   while (!executionEnd) {
-    std::this_thread::sleep_for (std::chrono::seconds(Intervals[i]));
+    std::this_thread::sleep_for(std::chrono::seconds(Intervals[i]));
     deviceStatus = !deviceStatus;
     std::cout << "Changed device status to " << deviceStatus << "\n";
-    i = (i+1) % Intervals.size();
+    i = (i + 1) % Intervals.size();
   }
-
 }
 
 #endif // __DEVICE_ABSTRACTION__
diff --git a/hpvm/projects/visc-rt/policy.h b/hpvm/projects/visc-rt/policy.h
index 4bd6fa046967a7a1632e89941b155695ee139718..d50e65868b376bfbcc3d4bd00d4919db677722b8 100644
--- a/hpvm/projects/visc-rt/policy.h
+++ b/hpvm/projects/visc-rt/policy.h
@@ -1,23 +1,21 @@
 #ifndef __POLICY__
 #define __POLICY__
 
-#include <string>
 #include "device_abstraction.h"
+#include <string>
 
- /************************* Policies *************************************/
+/************************* Policies *************************************/
 class Policy {
-  public:
-    virtual int getVersion(const char *, int64_t) = 0;
-    virtual ~Policy() {};
+public:
+  virtual int getVersion(const char *, int64_t) = 0;
+  virtual ~Policy(){};
 };
 
 class ConstPolicy : public Policy {
 public:
-  ConstPolicy(int deviceID): deviceID(deviceID) {}
+  ConstPolicy(int deviceID) : deviceID(deviceID) {}
 
-  int getVersion(const char *, int64_t) override {
-    return deviceID;
-  }
+  int getVersion(const char *, int64_t) override { return deviceID; }
 
 private:
   int deviceID;
@@ -26,16 +24,17 @@ private:
 class NodePolicy : public Policy {
   virtual int getVersion(const char *name, int64_t it) override {
     std::string s(name);
-    //std::string NodeNames[1] = { "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" };
+    // std::string NodeNames[1] = {
+    // "_Z9mysgemmNTPfiS_iS_iiff_clonedInternal_level2_cloned" };
     std::string NodeNames[] = {
-      "WrapperGaussianSmoothing_cloned",
-      "WrapperlaplacianEstimate_cloned",
-      "WrapperComputeZeroCrossings_cloned",
-      "WrapperComputeGradient_cloned",
-      "WrapperComputeMaxGradient_cloned",
-      "WrapperRejectZeroCrossings_cloned",
+        "WrapperGaussianSmoothing_cloned",
+        "WrapperlaplacianEstimate_cloned",
+        "WrapperComputeZeroCrossings_cloned",
+        "WrapperComputeGradient_cloned",
+        "WrapperComputeMaxGradient_cloned",
+        "WrapperRejectZeroCrossings_cloned",
     };
-    //if (!s.compare(NodeNames[4])) {
+    // if (!s.compare(NodeNames[4])) {
     //  std::cout << s << ": CPU" << "\n";
     //  return 0;
     //}
@@ -55,11 +54,10 @@ class IterationPolicy : public Policy {
 class DeviceStatusPolicy : public Policy {
   virtual int getVersion(const char *name, int64_t it) override {
     if (deviceStatus) {
-      //std::cout << "Returning GPU\n";
+      // std::cout << "Returning GPU\n";
       return 2;
-    }
-    else {
-      //std::cout << "Returning CPU\n";
+    } else {
+      // std::cout << "Returning CPU\n";
       return 0;
     }
   }
@@ -98,12 +96,12 @@ public:
     userTargetDeviceChoice = 1;
     end = false;
     userTargetDeviceChoiceThread =
-      std::thread(&InteractivePolicy::updateUserTargetChoice, this);
+        std::thread(&InteractivePolicy::updateUserTargetChoice, this);
   }
 
   ~InteractivePolicy() {
     end = true;
-    userTargetDeviceChoiceThread.join(); 
+    userTargetDeviceChoiceThread.join();
   }
 };
 
diff --git a/hpvm/projects/visc-rt/visc-rt.cpp b/hpvm/projects/visc-rt/visc-rt.cpp
index eff618548f3405b668249791738015043a537f17..53d3b516f22b59857b1a17aecba32a6b723998f0 100644
--- a/hpvm/projects/visc-rt/visc-rt.cpp
+++ b/hpvm/projects/visc-rt/visc-rt.cpp
@@ -1,43 +1,44 @@
-#include <iostream>
-#include <string>
-#include <pthread.h>
-#include <cstdlib>
+#include <CL/cl.h>
+#include <cassert>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
-#include <cassert>
+#include <iostream>
 #include <map>
-#include <CL/cl.h>
+#include <pthread.h>
+#include <string>
 
 #include <unistd.h>
 
 #if _POSIX_VERSION >= 200112L
-# include <sys/time.h>
+#include <sys/time.h>
 #endif
 #include "visc-rt.h"
 
 #ifndef DEBUG_BUILD
-#define DEBUG(s) {}
+#define DEBUG(s)                                                               \
+  {}
 #else
 #define DEBUG(s) s
 #endif
 
-#define BILLION   1000000000LL
+#define BILLION 1000000000LL
 
 using namespace std;
 
 typedef struct {
   pthread_t threadID;
-  std::vector<pthread_t>* threads;
-  // Map from InputPort to Size 
-  std::map<unsigned, uint64_t>* ArgInPortSizeMap;
-  //std::vector<uint64_t>* BindInSizes;
-  std::vector<unsigned>* BindInSourcePort;
-  std::vector<uint64_t>* BindOutSizes;
-  std::vector<uint64_t>* EdgeSizes;
-  std::vector<CircularBuffer<uint64_t>*>* BindInputBuffers;
-  std::vector<CircularBuffer<uint64_t>*>* BindOutputBuffers;
-  std::vector<CircularBuffer<uint64_t>*>* EdgeBuffers;
-  std::vector<CircularBuffer<uint64_t>*>* isLastInputBuffers;
+  std::vector<pthread_t> *threads;
+  // Map from InputPort to Size
+  std::map<unsigned, uint64_t> *ArgInPortSizeMap;
+  // std::vector<uint64_t>* BindInSizes;
+  std::vector<unsigned> *BindInSourcePort;
+  std::vector<uint64_t> *BindOutSizes;
+  std::vector<uint64_t> *EdgeSizes;
+  std::vector<CircularBuffer<uint64_t> *> *BindInputBuffers;
+  std::vector<CircularBuffer<uint64_t> *> *BindOutputBuffers;
+  std::vector<CircularBuffer<uint64_t> *> *EdgeBuffers;
+  std::vector<CircularBuffer<uint64_t> *> *isLastInputBuffers;
 } DFNodeContext_X86;
 
 typedef struct {
@@ -48,7 +49,7 @@ typedef struct {
 } DFNodeContext_OCL;
 
 cl_context globalOCLContext;
-cl_device_id* clDevices;
+cl_device_id *clDevices;
 cl_command_queue globalCommandQue;
 
 Policy *policy = NULL;
@@ -60,10 +61,10 @@ pthread_mutex_t ocl_mtx;
 #define NUM_TESTS 1
 visc_TimerSet kernel_timer;
 
-static inline void checkErr(cl_int err, cl_int success, const char * name) {
+static inline void checkErr(cl_int err, cl_int success, const char *name) {
   if (err != success) {
-  cout << "ERROR: " << name << flush << "\n";
-  cout << "ErrorCode: " << err << flush << "\n";
+    cout << "ERROR: " << name << flush << "\n";
+    cout << "ErrorCode: " << err << flush << "\n";
     exit(EXIT_FAILURE);
   }
 }
@@ -71,16 +72,17 @@ static inline void checkErr(cl_int err, cl_int success, const char * name) {
 /************************* Policies *************************************/
 void llvm_visc_policy_init() {
   cout << "Initializing policy object ...\n";
-//  policy = new NodePolicy();
-//  policy = new IterationPolicy();
-//  policy = new DeviceStatusPolicy();
+  //  policy = new NodePolicy();
+  //  policy = new IterationPolicy();
+  //  policy = new DeviceStatusPolicy();
   // policy = new InteractivePolicy();
   policy = new ConstPolicy(0);
   cout << "DONE: Initializing policy object.\n";
 }
 
 void llvm_visc_policy_clear() {
-  if (policy) free(policy);
+  // policy comes from `new`, so release it with delete (null-safe), not free.
+  delete policy;
 }
 
 int llvm_visc_policy_getVersion(const char *name, int64_t i) {
@@ -111,58 +113,65 @@ void llvm_visc_deviceAbstraction_end() {
 }
 
 void llvm_visc_deviceAbstraction_waitOnDeviceStatus() {
-  while (!deviceStatus) { };
+  while (!deviceStatus) {
+  };
   return;
 }
 
 /************************* Depth Stack Routines ***************************/
 
-void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX, uint64_t limitY,
-    uint64_t iY, uint64_t limitZ, uint64_t iZ) {
-    DEBUG(cout << "Pushing node information on stack:\n");
-    DEBUG(cout << "\tNumDim = " << n << "\t Limit(" << limitX << ", " << limitY << ", "<< limitZ <<")\n");
-    DEBUG(cout << "\tInstance(" << iX << ", " << iY << ", "<< iZ <<")\n");
-    DFGDepth nodeInfo (n, limitX, iX, limitY, iY, limitZ, iZ);
-    pthread_mutex_lock(&ocl_mtx);
-    DStack.push_back(nodeInfo);
-    DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n");
-    pthread_mutex_unlock(&ocl_mtx);
+void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX, uint64_t iX,
+                               uint64_t limitY, uint64_t iY, uint64_t limitZ,
+                               uint64_t iZ) {
+  DEBUG(cout << "Pushing node information on stack:\n");
+  DEBUG(cout << "\tNumDim = " << n << "\t Limit(" << limitX << ", " << limitY
+             << ", " << limitZ << ")\n");
+  DEBUG(cout << "\tInstance(" << iX << ", " << iY << ", " << iZ << ")\n");
+  DFGDepth nodeInfo(n, limitX, iX, limitY, iY, limitZ, iZ);
+  pthread_mutex_lock(&ocl_mtx);
+  DStack.push_back(nodeInfo);
+  DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n");
+  pthread_mutex_unlock(&ocl_mtx);
 }
 
 void llvm_visc_x86_dstack_pop() {
-    DEBUG(cout << "Popping from depth stack\n");
-    pthread_mutex_lock(&ocl_mtx);
-    DStack.pop_back();
-    DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n");
-    pthread_mutex_unlock(&ocl_mtx);
+  DEBUG(cout << "Popping from depth stack\n");
+  pthread_mutex_lock(&ocl_mtx);
+  DStack.pop_back();
+  DEBUG(cout << "DStack size = " << DStack.size() << flush << "\n");
+  pthread_mutex_unlock(&ocl_mtx);
 }
 
 uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim) {
-   DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level <<flush << "\n");
-   pthread_mutex_lock(&ocl_mtx);
-   unsigned size = DStack.size();
-   DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimLimit(dim) <<flush << "\n");
-   uint64_t result = DStack[size-level-1].getDimLimit(dim);
-   pthread_mutex_unlock(&ocl_mtx);
-   return result;
+  DEBUG(cout << "Request limit for dim " << dim << " of ancestor " << level
+             << flush << "\n");
+  pthread_mutex_lock(&ocl_mtx);
+  unsigned size = DStack.size();
+  DEBUG(cout << "\t Return: " << DStack[size - level - 1].getDimLimit(dim)
+             << flush << "\n");
+  uint64_t result = DStack[size - level - 1].getDimLimit(dim);
+  pthread_mutex_unlock(&ocl_mtx);
+  return result;
 }
 
 uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim) {
-    DEBUG(cout << "Request instance id for dim " << dim << " of ancestor " << level <<flush << "\n");
-    pthread_mutex_lock(&ocl_mtx);
-    unsigned size = DStack.size();
-    DEBUG(cout << "\t Return: " << DStack[size-level-1].getDimInstance(dim) <<flush << "\n");
-    uint64_t result = DStack[size-level-1].getDimInstance(dim);
-    pthread_mutex_unlock(&ocl_mtx);
-    return result;
+  DEBUG(cout << "Request instance id for dim " << dim << " of ancestor "
+             << level << flush << "\n");
+  pthread_mutex_lock(&ocl_mtx);
+  unsigned size = DStack.size();
+  DEBUG(cout << "\t Return: " << DStack[size - level - 1].getDimInstance(dim)
+             << flush << "\n");
+  uint64_t result = DStack[size - level - 1].getDimInstance(dim);
+  pthread_mutex_unlock(&ocl_mtx);
+  return result;
 }
 
 /********************** Memory Tracking Routines **************************/
 
-void llvm_visc_track_mem(void* ptr, size_t size) {
+void llvm_visc_track_mem(void *ptr, size_t size) {
   DEBUG(cout << "Start tracking memory: " << ptr << flush << "\n");
-  MemTrackerEntry* MTE = MTracker.lookup(ptr);
-  if(MTE != NULL) {
+  MemTrackerEntry *MTE = MTracker.lookup(ptr);
+  if (MTE != NULL) {
     DEBUG(cout << "ID " << ptr << " already present in the MemTracker Table\n");
     return;
   }
@@ -171,25 +180,28 @@ void llvm_visc_track_mem(void* ptr, size_t size) {
   DEBUG(MTracker.print());
 }
 
-void llvm_visc_untrack_mem(void* ptr) {
+void llvm_visc_untrack_mem(void *ptr) {
   DEBUG(cout << "Stop tracking memory: " << ptr << flush << "\n");
-  MemTrackerEntry* MTE = MTracker.lookup(ptr);
-  if(MTE == NULL) {
-    cout << "WARNING: Trying to remove ID " << ptr << " not present in the MemTracker Table\n";
+  MemTrackerEntry *MTE = MTracker.lookup(ptr);
+  if (MTE == NULL) {
+    cout << "WARNING: Trying to remove ID " << ptr
+         << " not present in the MemTracker Table\n";
     return;
   }
   DEBUG(cout << "Removing ID " << ptr << " from MemTracker Table\n");
-  if(MTE->getLocation() == MemTrackerEntry::DEVICE)
-    clReleaseMemObject((cl_mem) MTE->getAddress());
+  if (MTE->getLocation() == MemTrackerEntry::DEVICE)
+    clReleaseMemObject((cl_mem)MTE->getAddress());
   MTracker.remove(ptr);
   DEBUG(MTracker.print());
 }
 
-
-static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL* Context, bool isInput, bool isOutput) {
+static void *llvm_visc_ocl_request_mem(void *ptr, size_t size,
+                                       DFNodeContext_OCL *Context, bool isInput,
+                                       bool isOutput) {
   pthread_mutex_lock(&ocl_mtx);
-  DEBUG(cout << "[OCL] Request memory: " << ptr << " for context: " << Context->clOCLContext << flush << "\n");
-  MemTrackerEntry* MTE = MTracker.lookup(ptr);
+  DEBUG(cout << "[OCL] Request memory: " << ptr
+             << " for context: " << Context->clOCLContext << flush << "\n");
+  MemTrackerEntry *MTE = MTracker.lookup(ptr);
   if (MTE == NULL) {
     MTracker.print();
     cout << "ERROR: Requesting memory not present in Table\n";
@@ -197,89 +209,91 @@ static void* llvm_visc_ocl_request_mem(void* ptr, size_t size, DFNodeContext_OCL
   }
   // If already on device
   if (MTE->getLocation() == MemTrackerEntry::DEVICE &&
-      ((DFNodeContext_OCL*)MTE->getContext())->clOCLContext == Context->clOCLContext) {
-    DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n"); 
+      ((DFNodeContext_OCL *)MTE->getContext())->clOCLContext ==
+          Context->clOCLContext) {
+    DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush
+               << "\n");
     pthread_mutex_unlock(&ocl_mtx);
     return MTE->getAddress();
   }
-  
-  DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n");
+
+  DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush
+             << "\n");
   DEBUG(cout << "\t"; MTE->print(); cout << flush << "\n");
   // Else copy and update the latest copy
   cl_mem_flags clFlags;
   cl_int errcode;
 
-  if(isInput && isOutput) clFlags = CL_MEM_READ_WRITE;
-  else if(isInput)        clFlags = CL_MEM_READ_ONLY;
-  else if(isOutput)       clFlags = CL_MEM_WRITE_ONLY;
-  else                    clFlags = CL_MEM_READ_ONLY;
+  if (isInput && isOutput)
+    clFlags = CL_MEM_READ_WRITE;
+  else if (isInput)
+    clFlags = CL_MEM_READ_ONLY;
+  else if (isOutput)
+    clFlags = CL_MEM_WRITE_ONLY;
+  else
+    clFlags = CL_MEM_READ_ONLY;
 
   visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_mem d_input = clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
+  cl_mem d_input =
+      clCreateBuffer(Context->clOCLContext, clFlags, size, NULL, &errcode);
+  // pthread_mutex_unlock(&ocl_mtx);
   checkErr(errcode, CL_SUCCESS, "Failure to allocate memory on device");
-  DEBUG(cout<< "\nMemory allocated on device: " << d_input << flush << "\n");
-  if(isInput) {
+  DEBUG(cout << "\nMemory allocated on device: " << d_input << flush << "\n");
+  if (isInput) {
     DEBUG(cout << "\tCopying ...");
-    //pthread_mutex_lock(&ocl_mtx);
-    errcode = clEnqueueWriteBuffer(Context->clCommandQue,
-                                  d_input,
-                                  CL_TRUE,
-                                  0,
-                                  size,MTE->getAddress(),
-                                  0,NULL,NULL);
-    //pthread_mutex_unlock(&ocl_mtx);
+    // pthread_mutex_lock(&ocl_mtx);
+    errcode = clEnqueueWriteBuffer(Context->clCommandQue, d_input, CL_TRUE, 0,
+                                   size, MTE->getAddress(), 0, NULL, NULL);
+    // pthread_mutex_unlock(&ocl_mtx);
     checkErr(errcode, CL_SUCCESS, "Failure to copy memory to device");
   }
 
   visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
   DEBUG(cout << " done\n");
-  MTE->update(MemTrackerEntry::DEVICE, (void*) d_input, Context);
+  MTE->update(MemTrackerEntry::DEVICE, (void *)d_input, Context);
   DEBUG(cout << "Updated Table\n");
   DEBUG(MTracker.print());
   pthread_mutex_unlock(&ocl_mtx);
   return d_input;
 }
 
-void* llvm_visc_x86_argument_ptr(void* ptr, size_t size) {
+void *llvm_visc_x86_argument_ptr(void *ptr, size_t size) {
   return llvm_visc_request_mem(ptr, size);
 }
 
-void* llvm_visc_request_mem(void* ptr, size_t size) {
+void *llvm_visc_request_mem(void *ptr, size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "[X86] Request memory: " << ptr << flush << "\n");
-  MemTrackerEntry* MTE = MTracker.lookup(ptr);
-  if(MTE == NULL) {
+  MemTrackerEntry *MTE = MTracker.lookup(ptr);
+  if (MTE == NULL) {
     cout << "ERROR: Requesting memory not present in Table\n";
     pthread_mutex_unlock(&ocl_mtx);
     exit(EXIT_FAILURE);
   }
   // If already on host
-  if(MTE->getLocation() == MemTrackerEntry::HOST) {
-    DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush << "\n"); 
+  if (MTE->getLocation() == MemTrackerEntry::HOST) {
+    DEBUG(cout << "\tMemory found on host at: " << MTE->getAddress() << flush
+               << "\n");
     pthread_mutex_unlock(&ocl_mtx);
     return MTE->getAddress();
   }
 
   // Else copy from device and update table
-  DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush << "\n");
+  DEBUG(cout << "\tMemory found on device at: " << MTE->getAddress() << flush
+             << "\n");
   DEBUG(cout << "\tCopying ...");
   visc_SwitchToTimer(&kernel_timer, visc_TimerID_COPY);
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_int errcode = clEnqueueReadBuffer(((DFNodeContext_OCL*)MTE->getContext())->clCommandQue,
-                                      (cl_mem) MTE->getAddress(),
-                                      CL_TRUE,
-                                      0,
-                                      size,
-                                      ptr,
-                                      0, NULL, NULL);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
+  cl_int errcode = clEnqueueReadBuffer(
+      ((DFNodeContext_OCL *)MTE->getContext())->clCommandQue,
+      (cl_mem)MTE->getAddress(), CL_TRUE, 0, size, ptr, 0, NULL, NULL);
+  // pthread_mutex_unlock(&ocl_mtx);
   visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
   DEBUG(cout << " done\n");
   checkErr(errcode, CL_SUCCESS, "[request mem] Failure to read output");
   DEBUG(cout << "Free mem object on device\n");
-  clReleaseMemObject((cl_mem) MTE->getAddress());
+  clReleaseMemObject((cl_mem)MTE->getAddress());
   DEBUG(cout << "Updated Table\n");
   MTE->update(MemTrackerEntry::HOST, ptr);
   DEBUG(MTracker.print());
@@ -289,63 +303,57 @@ void* llvm_visc_request_mem(void* ptr, size_t size) {
 
 /*************************** Timer Routines **********************************/
 
-static int is_async(enum visc_TimerID timer)
-{
-  return (timer == visc_TimerID_KERNEL) ||
-             (timer == visc_TimerID_COPY_ASYNC);
+static int is_async(enum visc_TimerID timer) {
+  return (timer == visc_TimerID_KERNEL) || (timer == visc_TimerID_COPY_ASYNC);
 }
 
-static int is_blocking(enum visc_TimerID timer)
-{
+static int is_blocking(enum visc_TimerID timer) {
   return (timer == visc_TimerID_COPY) || (timer == visc_TimerID_NONE);
 }
 
 #define INVALID_TIMERID visc_TimerID_LAST
 
-static int asyncs_outstanding(struct visc_TimerSet* timers)
-{
+static int asyncs_outstanding(struct visc_TimerSet *timers) {
   return (timers->async_markers != NULL) &&
-           (timers->async_markers->timerID != INVALID_TIMERID);
+         (timers->async_markers->timerID != INVALID_TIMERID);
 }
 
 static struct visc_async_time_marker_list *
-get_last_async(struct visc_TimerSet* timers)
-{
+get_last_async(struct visc_TimerSet *timers) {
   /* Find the last event recorded thus far */
-  struct visc_async_time_marker_list * last_event = timers->async_markers;
-  if(last_event != NULL && last_event->timerID != INVALID_TIMERID) {
-    while(last_event->next != NULL &&
-            last_event->next->timerID != INVALID_TIMERID)
+  struct visc_async_time_marker_list *last_event = timers->async_markers;
+  if (last_event != NULL && last_event->timerID != INVALID_TIMERID) {
+    while (last_event->next != NULL &&
+           last_event->next->timerID != INVALID_TIMERID)
       last_event = last_event->next;
     return last_event;
   } else
     return NULL;
 }
 
-static void insert_marker(struct visc_TimerSet* tset, enum visc_TimerID timer)
-{
+static void insert_marker(struct visc_TimerSet *tset, enum visc_TimerID timer) {
   cl_int ciErrNum = CL_SUCCESS;
-  struct visc_async_time_marker_list ** new_event = &(tset->async_markers);
+  struct visc_async_time_marker_list **new_event = &(tset->async_markers);
 
-  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
+  while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
     new_event = &((*new_event)->next);
   }
 
-  if(*new_event == NULL) {
-    *new_event = (struct visc_async_time_marker_list *)
-      			malloc(sizeof(struct visc_async_time_marker_list));
+  if (*new_event == NULL) {
+    *new_event = (struct visc_async_time_marker_list *)malloc(
+        sizeof(struct visc_async_time_marker_list));
     (*new_event)->marker = calloc(1, sizeof(cl_event));
     /*
-    // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event
-#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
-fprintf(stderr, "Creating Marker [%d]\n", timer);
-    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Creating User Event Object!\n");
+    // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event
+#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
+fprintf(stderr, "Creating Marker [%d]\n", timer);
+    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error Creating User Event Object!\n");
     }
-    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Setting User Event Status!\n");
+    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error Setting User Event Status!\n");
     }
 #endif
 */
@@ -355,38 +363,38 @@ fprintf(stderr, "Creating Marker [%d]\n", timer);
   /* valid event handle now aquired: insert the event record */
   (*new_event)->label = NULL;
   (*new_event)->timerID = timer;
-  //pthread_mutex_lock(&ocl_mtx);
-  ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
+  ciErrNum =
+      clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
+  // pthread_mutex_unlock(&ocl_mtx);
   if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Enqueueing Marker!\n");
+    fprintf(stderr, "Error Enqueueing Marker!\n");
   }
-
 }
 
-static void insert_submarker(struct visc_TimerSet* tset, char *label, enum visc_TimerID timer)
-{
+static void insert_submarker(struct visc_TimerSet *tset, char *label,
+                             enum visc_TimerID timer) {
   cl_int ciErrNum = CL_SUCCESS;
-  struct visc_async_time_marker_list ** new_event = &(tset->async_markers);
+  struct visc_async_time_marker_list **new_event = &(tset->async_markers);
 
-  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
+  while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
     new_event = &((*new_event)->next);
   }
 
-  if(*new_event == NULL) {
-    *new_event = (struct visc_async_time_marker_list *)
-      			malloc(sizeof(struct visc_async_time_marker_list));
+  if (*new_event == NULL) {
+    *new_event = (struct visc_async_time_marker_list *)malloc(
+        sizeof(struct visc_async_time_marker_list));
     (*new_event)->marker = calloc(1, sizeof(cl_event));
     /*
 #if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
 fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer);
-    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Creating User Event Object!\n");
+    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error Creating User Event Object!\n");
     }
-    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Setting User Event Status!\n");
+    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
+    if (ciErrNum != CL_SUCCESS) {
+      fprintf(stderr, "Error Setting User Event Status!\n");
     }
 #endif
 */
@@ -396,44 +404,49 @@ fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer);
   /* valid event handle now aquired: insert the event record */
   (*new_event)->label = label;
   (*new_event)->timerID = timer;
-  //pthread_mutex_lock(&ocl_mtx);
-  ciErrNum = clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
+  ciErrNum =
+      clEnqueueMarker(globalCommandQue, (cl_event *)(*new_event)->marker);
+  // pthread_mutex_unlock(&ocl_mtx);
   if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Enqueueing Marker!\n");
+    fprintf(stderr, "Error Enqueueing Marker!\n");
   }
-
 }
 
-
 /* Assumes that all recorded events have completed */
-static visc_Timestamp record_async_times(struct visc_TimerSet* tset)
-{
-  struct visc_async_time_marker_list * next_interval = NULL;
-  struct visc_async_time_marker_list * last_marker = get_last_async(tset);
+static visc_Timestamp record_async_times(struct visc_TimerSet *tset) {
+  struct visc_async_time_marker_list *next_interval = NULL;
+  struct visc_async_time_marker_list *last_marker = get_last_async(tset);
   visc_Timestamp total_async_time = 0;
 
-  for(next_interval = tset->async_markers; next_interval != last_marker;
-      next_interval = next_interval->next) {
-    cl_ulong command_start=0, command_end=0;
+  for (next_interval = tset->async_markers; next_interval != last_marker;
+       next_interval = next_interval->next) {
+    cl_ulong command_start = 0, command_end = 0;
     cl_int ciErrNum = CL_SUCCESS;
 
-    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL);
+    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker),
+                                       CL_PROFILING_COMMAND_END,
+                                       sizeof(cl_ulong), &command_start, NULL);
     if (ciErrNum != CL_SUCCESS) {
       fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum);
     }
 
-    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL);
+    ciErrNum = clGetEventProfilingInfo(
+        *((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END,
+        sizeof(cl_ulong), &command_end, NULL);
     if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum);
+      fprintf(stderr, "Error getting second EventProfilingInfo: %d\n",
+              ciErrNum);
     }
 
-    visc_Timestamp interval = (visc_Timestamp) (((double)(command_end - command_start)));
+    visc_Timestamp interval =
+        (visc_Timestamp)(((double)(command_end - command_start)));
     tset->timers[next_interval->timerID].elapsed += interval;
     if (next_interval->label != NULL) {
-      struct visc_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list;
+      struct visc_SubTimer *subtimer =
+          tset->sub_timer_list[next_interval->timerID]->subtimer_list;
       while (subtimer != NULL) {
-        if ( strcmp(subtimer->label, next_interval->label) == 0) {
+        if (strcmp(subtimer->label, next_interval->label) == 0) {
           subtimer->timer.elapsed += interval;
           break;
         }
@@ -444,50 +457,42 @@ static visc_Timestamp record_async_times(struct visc_TimerSet* tset)
     next_interval->timerID = INVALID_TIMERID;
   }
 
-  if(next_interval != NULL)
+  if (next_interval != NULL)
     next_interval->timerID = INVALID_TIMERID;
 
   return total_async_time;
 }
 
-static void
-accumulate_time(visc_Timestamp *accum,
-		visc_Timestamp start,
-		visc_Timestamp end)
-{
+static void accumulate_time(visc_Timestamp *accum, visc_Timestamp start,
+                            visc_Timestamp end) {
 #if _POSIX_VERSION >= 200112L
   *accum += end - start;
 #else
-# error "Timestamps not implemented for this system"
+#error "Timestamps not implemented for this system"
 #endif
 }
 
 #if _POSIX_VERSION >= 200112L
-static visc_Timestamp get_time()
-{
+static visc_Timestamp get_time() {
   struct timespec tv;
   clock_gettime(CLOCK_MONOTONIC, &tv);
-  return (visc_Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec);
+  return (visc_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec);
 }
 #else
-# error "no supported time libraries are available on this platform"
+#error "no supported time libraries are available on this platform"
 #endif
 
-void
-visc_ResetTimer(struct visc_Timer *timer)
-{
+void visc_ResetTimer(struct visc_Timer *timer) {
   timer->state = visc_Timer_STOPPED;
 
 #if _POSIX_VERSION >= 200112L
   timer->elapsed = 0;
 #else
-# error "visc_ResetTimer: not implemented for this system"
+#error "visc_ResetTimer: not implemented for this system"
 #endif
 }
 
-void
-visc_StartTimer(struct visc_Timer *timer)
-{
+void visc_StartTimer(struct visc_Timer *timer) {
   if (timer->state != visc_Timer_STOPPED) {
     // FIXME: Removing warning statement to avoid printing this error
     // fputs("Ignoring attempt to start a running timer\n", stderr);
@@ -503,13 +508,12 @@ visc_StartTimer(struct visc_Timer *timer)
     timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-# error "visc_StartTimer: not implemented for this system"
+#error "visc_StartTimer: not implemented for this system"
 #endif
 }
 
-void
-visc_StartTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer)
-{
+void visc_StartTimerAndSubTimer(struct visc_Timer *timer,
+                                struct visc_Timer *subtimer) {
 
   unsigned int numNotStopped = 0x3; // 11
   if (timer->state != visc_Timer_STOPPED) {
@@ -521,7 +525,7 @@ visc_StartTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer
     numNotStopped &= 0x2; // Zero out 2^0
   }
   if (numNotStopped == 0x0) {
-    //fputs("Ignoring attempt to start running timer and subtimer\n", stderr);
+    // fputs("Ignoring attempt to start running timer and subtimer\n", stderr);
     return;
   }
 
@@ -542,18 +546,15 @@ visc_StartTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer
     }
   }
 #else
-# error "visc_StartTimer: not implemented for this system"
+#error "visc_StartTimer: not implemented for this system"
 #endif
-
 }
 
-void
-visc_StopTimer(struct visc_Timer *timer)
-{
+void visc_StopTimer(struct visc_Timer *timer) {
   visc_Timestamp fini;
 
   if (timer->state != visc_Timer_RUNNING) {
-    //fputs("Ignoring attempt to stop a stopped timer\n", stderr);
+    // fputs("Ignoring attempt to stop a stopped timer\n", stderr);
     return;
   }
 
@@ -566,14 +567,15 @@ visc_StopTimer(struct visc_Timer *timer)
     fini = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-# error "visc_StopTimer: not implemented for this system"
+#error "visc_StopTimer: not implemented for this system"
 #endif
 
   accumulate_time(&timer->elapsed, timer->init, fini);
   timer->init = fini;
 }
 
-void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subtimer) {
+void visc_StopTimerAndSubTimer(struct visc_Timer *timer,
+                               struct visc_Timer *subtimer) {
 
   visc_Timestamp fini;
 
@@ -587,11 +589,10 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subt
     numNotRunning &= 0x2; // Zero out 2^0
   }
   if (numNotRunning == 0x0) {
-    //fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr);
+    // fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr);
     return;
   }
 
-
   timer->state = visc_Timer_STOPPED;
   subtimer->state = visc_Timer_STOPPED;
 
@@ -602,7 +603,7 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subt
     fini = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-# error "visc_StopTimer: not implemented for this system"
+#error "visc_StopTimer: not implemented for this system"
 #endif
 
   if (numNotRunning & 0x2) {
@@ -614,13 +615,10 @@ void visc_StopTimerAndSubTimer(struct visc_Timer *timer, struct visc_Timer *subt
     accumulate_time(&subtimer->elapsed, subtimer->init, fini);
     subtimer->init = fini;
   }
-
 }
 
 /* Get the elapsed time in seconds. */
-double
-visc_GetElapsedTime(struct visc_Timer *timer)
-{
+double visc_GetElapsedTime(struct visc_Timer *timer) {
   double ret;
 
   if (timer->state != visc_Timer_STOPPED) {
@@ -630,14 +628,12 @@ visc_GetElapsedTime(struct visc_Timer *timer)
 #if _POSIX_VERSION >= 200112L
   ret = timer->elapsed / 1e9;
 #else
-# error "visc_GetElapsedTime: not implemented for this system"
+#error "visc_GetElapsedTime: not implemented for this system"
 #endif
   return ret;
 }
 
-void
-visc_InitializeTimerSet(struct visc_TimerSet *timers)
-{
+void visc_InitializeTimerSet(struct visc_TimerSet *timers) {
   int n;
 
   timers->wall_begin = get_time();
@@ -651,25 +647,25 @@ visc_InitializeTimerSet(struct visc_TimerSet *timers)
   }
 }
 
+void visc_AddSubTimer(struct visc_TimerSet *timers, char *label,
+                      enum visc_TimerID visc_Category) {
 
-void
-visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID visc_Category) {
-
-  struct visc_SubTimer *subtimer = (struct visc_SubTimer *) malloc
-    (sizeof(struct visc_SubTimer));
+  struct visc_SubTimer *subtimer =
+      (struct visc_SubTimer *)malloc(sizeof(struct visc_SubTimer));
 
   int len = strlen(label);
 
-  subtimer->label = (char *) malloc (sizeof(char)*(len+1));
+  subtimer->label = (char *)malloc(sizeof(char) * (len + 1));
   sprintf(subtimer->label, "%s", label);
 
   visc_ResetTimer(&subtimer->timer);
   subtimer->next = NULL;
 
-  struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[visc_Category];
+  struct visc_SubTimerList *subtimerlist =
+      timers->sub_timer_list[visc_Category];
   if (subtimerlist == NULL) {
-    subtimerlist = (struct visc_SubTimerList *) calloc
-      (1, sizeof(struct visc_SubTimerList));
+    subtimerlist =
+        (struct visc_SubTimerList *)calloc(1, sizeof(struct visc_SubTimerList));
     subtimerlist->subtimer_list = subtimer;
     timers->sub_timer_list[visc_Category] = subtimerlist;
   } else {
@@ -680,22 +676,22 @@ visc_AddSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID vi
     }
     element->next = subtimer;
   }
-
 }
 
-void
-visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer)
-{
-  //cerr << "Switch to timer: " << timer << flush << "\n";
+void visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer) {
+  // cerr << "Switch to timer: " << timer << flush << "\n";
   /* Stop the currently running timer */
   if (timers->current != visc_TimerID_NONE) {
-    struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
-    struct visc_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL;
+    struct visc_SubTimerList *subtimerlist =
+        timers->sub_timer_list[timers->current];
+    struct visc_SubTimer *currSubTimer =
+        (subtimerlist != NULL) ? subtimerlist->current : NULL;
 
-    if (!is_async(timers->current) ) {
+    if (!is_async(timers->current)) {
       if (timers->current != timer) {
         if (currSubTimer != NULL) {
-          visc_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer);
+          visc_StopTimerAndSubTimer(&timers->timers[timers->current],
+                                    &currSubTimer->timer);
         } else {
           visc_StopTimer(&timers->timers[timers->current]);
         }
@@ -717,30 +713,31 @@ visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer)
   /* The only cases we check for asynchronous task completion is
    * when an overlapping CPU operation completes, or the next
    * segment blocks on completion of previous async operations */
-  if( asyncs_outstanding(timers) &&
-      (!is_async(timers->current) || is_blocking(timer) ) ) {
+  if (asyncs_outstanding(timers) &&
+      (!is_async(timers->current) || is_blocking(timer))) {
 
-    struct visc_async_time_marker_list * last_event = get_last_async(timers);
+    struct visc_async_time_marker_list *last_event = get_last_async(timers);
     /* CL_COMPLETE if completed */
 
     cl_int ciErrNum = CL_SUCCESS;
     cl_int async_done = CL_COMPLETE;
 
-    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
+    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker),
+                              CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int),
+                              &async_done, NULL);
     if (ciErrNum != CL_SUCCESS) {
       fprintf(stdout, "Error Querying EventInfo1!\n");
     }
 
-
-    if(is_blocking(timer)) {
+    if (is_blocking(timer)) {
       /* Async operations completed after previous CPU operations:
        * overlapped time is the total CPU time since this set of async
        * operations were first issued */
 
       // timer to switch to is COPY or NONE
-      if(async_done != CL_COMPLETE) {
+      if (async_done != CL_COMPLETE) {
         accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
-	                  timers->async_begin,currentTime);
+                        timers->async_begin, currentTime);
       }
 
       /* Wait on async operation completion */
@@ -753,25 +750,27 @@ visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer)
 
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-      if(async_done == CL_COMPLETE) {
-        //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time);
+      if (async_done == CL_COMPLETE) {
+        // fprintf(stderr, "Async_done: total_async_type = %lld\n",
+        // total_async_time);
         timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
       }
 
     } else
-    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
-    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
-    // so something is deeper in stack
-    if(async_done == CL_COMPLETE ) {
+        /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
+        // i.e. the current timer is not async (not KERNEL/COPY_ASYNC) but
+        // asyncs are still outstanding, so something is deeper in the stack
+        if (async_done == CL_COMPLETE) {
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-      timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers);
+      timers->timers[visc_TimerID_OVERLAP].elapsed +=
+          record_async_times(timers);
     }
   }
 
   /* Start the new timer */
   if (timer != visc_TimerID_NONE) {
-    if(!is_async(timer)) {
+    if (!is_async(timer)) {
       visc_StartTimer(&timers->timers[timer]);
     } else {
       // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
@@ -780,13 +779,13 @@ visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer)
 
         insert_marker(timers, timer);
         timers->async_begin = currentTime;
-      } else if(!is_async(timers->current)) {
+      } else if (!is_async(timers->current)) {
         /* Previous asyncs still in flight, but a previous SwitchTo
          * already marked the end of the most recent async operation,
          * so we can rename that marker as the beginning of this async
          * operation */
 
-        struct visc_async_time_marker_list * last_event = get_last_async(timers);
+        struct visc_async_time_marker_list *last_event = get_last_async(timers);
         last_event->label = NULL;
         last_event->timerID = timer;
       }
@@ -796,20 +795,21 @@ visc_SwitchToTimer(struct visc_TimerSet *timers, enum visc_TimerID timer)
     }
   }
   timers->current = timer;
-
 }
 
-void
-visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_TimerID category)
-{
-  struct visc_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
-  struct visc_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL;
+void visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label,
+                           enum visc_TimerID category) {
+  struct visc_SubTimerList *subtimerlist =
+      timers->sub_timer_list[timers->current];
+  struct visc_SubTimer *curr =
+      (subtimerlist != NULL) ? subtimerlist->current : NULL;
 
   if (timers->current != visc_TimerID_NONE) {
-    if (!is_async(timers->current) ) {
+    if (!is_async(timers->current)) {
       if (timers->current != category) {
         if (curr != NULL) {
-          visc_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer);
+          visc_StopTimerAndSubTimer(&timers->timers[timers->current],
+                                    &curr->timer);
         } else {
           visc_StopTimer(&timers->timers[timers->current]);
         }
@@ -831,32 +831,35 @@ visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_Timer
   /* The only cases we check for asynchronous task completion is
    * when an overlapping CPU operation completes, or the next
    * segment blocks on completion of previous async operations */
-  if( asyncs_outstanding(timers) &&
-      (!is_async(timers->current) || is_blocking(category) ) ) {
+  if (asyncs_outstanding(timers) &&
+      (!is_async(timers->current) || is_blocking(category))) {
 
-    struct visc_async_time_marker_list * last_event = get_last_async(timers);
+    struct visc_async_time_marker_list *last_event = get_last_async(timers);
     /* CL_COMPLETE if completed */
 
     cl_int ciErrNum = CL_SUCCESS;
     cl_int async_done = CL_COMPLETE;
 
-    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
+    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker),
+                              CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int),
+                              &async_done, NULL);
     if (ciErrNum != CL_SUCCESS) {
       fprintf(stdout, "Error Querying EventInfo2!\n");
     }
 
-    if(is_blocking(category)) {
+    if (is_blocking(category)) {
       /* Async operations completed after previous CPU operations:
        * overlapped time is the total CPU time since this set of async
        * operations were first issued */
 
       // timer to switch to is COPY or NONE
-      // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP
-      // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization
-      // it seems that the extra sync wall time isn't being recorded anywhere
-      if(async_done != CL_COMPLETE)
+      // If it hasn't already finished, then just take now and use that as the
+      // elapsed time in OVERLAP. Anything happening after now isn't OVERLAP
+      // because everything is being stopped to wait for synchronization. It
+      // seems that the extra sync wall time isn't being recorded anywhere.
+      if (async_done != CL_COMPLETE)
         accumulate_time(&(timers->timers[visc_TimerID_OVERLAP].elapsed),
-	                  timers->async_begin,currentTime);
+                        timers->async_begin, currentTime);
 
       /* Wait on async operation completion */
       ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
@@ -867,19 +870,21 @@ visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_Timer
 
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-       // If it did finish, then accumulate all the async time that did happen into OVERLAP
-       // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed.
-      if(async_done == CL_COMPLETE /*cudaSuccess*/)
+      // If it did finish, then accumulate all the async time that did happen
+      // into OVERLAP. The immediately preceding EventSynchronize theoretically
+      // didn't have any effect since it was already completed.
+      if (async_done == CL_COMPLETE /*cudaSuccess*/)
         timers->timers[visc_TimerID_OVERLAP].elapsed += total_async_time;
 
     } else
-    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
-    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
-    // so something is deeper in stack
-    if(async_done == CL_COMPLETE /*cudaSuccess*/) {
+        /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
+        // i.e. current is not async (not KERNEL/COPY_ASYNC) but asyncs are
+        // still outstanding, so something is deeper in the stack
+        if (async_done == CL_COMPLETE /*cudaSuccess*/) {
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-      timers->timers[visc_TimerID_OVERLAP].elapsed += record_async_times(timers);
+      timers->timers[visc_TimerID_OVERLAP].elapsed +=
+          record_async_times(timers);
     }
     // else, this isn't blocking, so just check the next time around
   }
@@ -900,7 +905,7 @@ visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_Timer
 
   /* Start the new timer */
   if (category != visc_TimerID_NONE) {
-    if(!is_async(category)) {
+    if (!is_async(category)) {
       if (subtimerlist != NULL) {
         subtimerlist->current = subtimer;
       }
@@ -922,18 +927,19 @@ visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_Timer
         /* No asyncs outstanding, insert a fresh async marker */
         insert_submarker(timers, label, category);
         timers->async_begin = currentTime;
-      } else if(!is_async(timers->current)) {
+      } else if (!is_async(timers->current)) {
         /* Previous asyncs still in flight, but a previous SwitchTo
          * already marked the end of the most recent async operation,
          * so we can rename that marker as the beginning of this async
          * operation */
 
-        struct visc_async_time_marker_list * last_event = get_last_async(timers);
+        struct visc_async_time_marker_list *last_event = get_last_async(timers);
         last_event->timerID = category;
         last_event->label = label;
       } // else, marker for switchToThis was already inserted
 
-      //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running
+      // toSwitchto is already asynchronous, but if current/prev state is async
+      // too, then DRIVER is already running
       if (!is_async(timers->current)) {
         visc_StartTimer(&timers->timers[visc_TimerID_DRIVER]);
       }
@@ -943,39 +949,41 @@ visc_SwitchToSubTimer(struct visc_TimerSet *timers, char *label, enum visc_Timer
   timers->current = category;
 }
 
-void
-visc_PrintTimerSet(struct visc_TimerSet *timers)
-{
+void visc_PrintTimerSet(struct visc_TimerSet *timers) {
   visc_Timestamp wall_end = get_time();
 
   struct visc_Timer *t = timers->timers;
-  struct visc_SubTimer* sub = NULL;
+  struct visc_SubTimer *sub = NULL;
 
   int maxSubLength;
 
   const char *categories[] = {
-    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap",
-    "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free",
-    "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc",
-    "Pthread_Create", "Arg_Pack", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack"
+      "IO",          "Kernel",         "Copy",         "Driver",
+      "Copy Async",  "Compute",        "Overlap",      "Init_Ctx",
+      "Clear_Ctx",   "Copy_Scalar",    "Copy_Ptr",     "Mem_Free",
+      "Read_Output", "Setup",          "Mem_Track",    "Mem_Untrack",
+      "Misc",        "Pthread_Create", "Arg_Pack",     "Arg_Unpack",
+      "Computation", "Output_Pack",    "Output_Unpack"
 
   };
 
   const int maxCategoryLength = 20;
 
   int i;
-  for(i = 1; i < visc_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format
-    if(visc_GetElapsedTime(&t[i]) != 0 || true) {
+  for (i = 1; i < visc_TimerID_LAST;
+       ++i) { // exclude NONE and OVERLAP from this format
+    if (visc_GetElapsedTime(&t[i]) != 0 || true) {
 
       // Print Category Timer
-      printf("%-*s: %.9f\n", maxCategoryLength, categories[i-1], visc_GetElapsedTime(&t[i]));
+      printf("%-*s: %.9f\n", maxCategoryLength, categories[i - 1],
+             visc_GetElapsedTime(&t[i]));
 
       if (timers->sub_timer_list[i] != NULL) {
         sub = timers->sub_timer_list[i]->subtimer_list;
         maxSubLength = 0;
         while (sub != NULL) {
           // Find longest SubTimer label
-          if (strlen(sub->label) > (unsigned long) maxSubLength) {
+          if (strlen(sub->label) > (unsigned long)maxSubLength) {
             maxSubLength = strlen(sub->label);
           }
           sub = sub->next;
@@ -983,47 +991,47 @@ visc_PrintTimerSet(struct visc_TimerSet *timers)
 
         // Fit to Categories
         if (maxSubLength <= maxCategoryLength) {
-         maxSubLength = maxCategoryLength;
+          maxSubLength = maxCategoryLength;
         }
 
         sub = timers->sub_timer_list[i]->subtimer_list;
 
         // Print SubTimers
         while (sub != NULL) {
-          printf(" -%-*s: %.9f\n", maxSubLength, sub->label, visc_GetElapsedTime(&sub->timer));
+          printf(" -%-*s: %.9f\n", maxSubLength, sub->label,
+                 visc_GetElapsedTime(&sub->timer));
           sub = sub->next;
         }
       }
     }
   }
 
-  if(visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0)
-    printf("CPU/Kernel Overlap: %.9f\n", visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]));
+  if (visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]) != 0)
+    printf("CPU/Kernel Overlap: %.9f\n",
+           visc_GetElapsedTime(&t[visc_TimerID_OVERLAP]));
 
-  float walltime = (wall_end - timers->wall_begin)/ 1e9;
+  float walltime = (wall_end - timers->wall_begin) / 1e9;
   printf("Timer Wall Time: %.9f\n", walltime);
-
 }
 
-void visc_DestroyTimerSet(struct visc_TimerSet * timers)
-{
+void visc_DestroyTimerSet(struct visc_TimerSet *timers) {
   /* clean up all of the async event markers */
-  struct visc_async_time_marker_list* event = timers->async_markers;
-  while(event != NULL) {
+  struct visc_async_time_marker_list *event = timers->async_markers;
+  while (event != NULL) {
 
     cl_int ciErrNum = CL_SUCCESS;
     ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker);
     if (ciErrNum != CL_SUCCESS) {
-      //fprintf(stderr, "Error Waiting for Events!\n");
+      // fprintf(stderr, "Error Waiting for Events!\n");
     }
 
-    ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) );
+    ciErrNum = clReleaseEvent(*((cl_event *)(event)->marker));
     if (ciErrNum != CL_SUCCESS) {
       fprintf(stderr, "Error Release Events!\n");
     }
 
     free((event)->marker);
-    struct visc_async_time_marker_list* next = ((event)->next);
+    struct visc_async_time_marker_list *next = ((event)->next);
 
     free(event);
 
@@ -1032,7 +1040,7 @@ void visc_DestroyTimerSet(struct visc_TimerSet * timers)
   }
 
   int i = 0;
-  for(i = 0; i < visc_TimerID_LAST; ++i) {
+  for (i = 0; i < visc_TimerID_LAST; ++i) {
     if (timers->sub_timer_list[i] != NULL) {
       struct visc_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
       struct visc_SubTimer *prev = NULL;
@@ -1051,194 +1059,210 @@ void visc_DestroyTimerSet(struct visc_TimerSet * timers)
 #define BUFFER_SIZE 1
 
 // Launch API for a streaming dataflow graph
-void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void* args) {
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86));
+void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *args) {
+  DFNodeContext_X86 *Context =
+      (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86));
 
   Context->threads = new std::vector<pthread_t>();
   Context->ArgInPortSizeMap = new std::map<unsigned, uint64_t>();
-  //Context->BindInSizes = new std::vector<uint64_t>();
+  // Context->BindInSizes = new std::vector<uint64_t>();
   Context->BindInSourcePort = new std::vector<unsigned>();
   Context->BindOutSizes = new std::vector<uint64_t>();
   Context->EdgeSizes = new std::vector<uint64_t>();
-  Context->BindInputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
-  Context->BindOutputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
-  Context->EdgeBuffers = new std::vector<CircularBuffer<uint64_t>*>();
-  Context->isLastInputBuffers = new std::vector<CircularBuffer<uint64_t>*>();
+  Context->BindInputBuffers = new std::vector<CircularBuffer<uint64_t> *>();
+  Context->BindOutputBuffers = new std::vector<CircularBuffer<uint64_t> *>();
+  Context->EdgeBuffers = new std::vector<CircularBuffer<uint64_t> *>();
+  Context->isLastInputBuffers = new std::vector<CircularBuffer<uint64_t> *>();
 
-  DEBUG(cout << "StreamLaunch -- Graph: " << Context << ", Arguments: " << args << flush << "\n");
+  DEBUG(cout << "StreamLaunch -- Graph: " << Context << ", Arguments: " << args
+             << flush << "\n");
   LaunchFunc(args, Context);
   return Context;
 }
 
 // Push API for a streaming dataflow graph
-void llvm_visc_streamPush(void* graphID, void* args) {
-  DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args << flush << "\n");
-  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
+void llvm_visc_streamPush(void *graphID, void *args) {
+  DEBUG(cout << "StreamPush -- Graph: " << graphID << ", Arguments: " << args
+             << flush << "\n");
+  DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
   unsigned offset = 0;
-  for (unsigned i=0; i< Ctx->ArgInPortSizeMap->size(); i++) {
+  for (unsigned i = 0; i < Ctx->ArgInPortSizeMap->size(); i++) {
     uint64_t element;
-    memcpy(&element, (char*)args+offset, Ctx->ArgInPortSizeMap->at(i));
+    memcpy(&element, (char *)args + offset, Ctx->ArgInPortSizeMap->at(i));
     offset += Ctx->ArgInPortSizeMap->at(i);
-    for(unsigned j=0; j<Ctx->BindInputBuffers->size();j++) {
-      if(Ctx->BindInSourcePort->at(j) == i) {
+    for (unsigned j = 0; j < Ctx->BindInputBuffers->size(); j++) {
+      if (Ctx->BindInSourcePort->at(j) == i) {
         // Push to all bind buffers connected to parent node at this port
-        //DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
+        // DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
         llvm_visc_bufferPush(Ctx->BindInputBuffers->at(j), element);
       }
     }
   }
   // Push 0 in isLastInput buffers of all child nodes
-  for (CircularBuffer<uint64_t>* buffer: *(Ctx->isLastInputBuffers))
+  for (CircularBuffer<uint64_t> *buffer : *(Ctx->isLastInputBuffers))
     llvm_visc_bufferPush(buffer, 0);
 }
 
 // Pop API for a streaming dataflow graph
-void* llvm_visc_streamPop(void* graphID) {
+void *llvm_visc_streamPop(void *graphID) {
   DEBUG(cout << "StreamPop -- Graph: " << graphID << flush << "\n");
-  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
+  DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
   unsigned totalBytes = 0;
-  for(uint64_t size:  *(Ctx->BindOutSizes))
-    totalBytes+= size;
-  void* output = malloc(totalBytes);
+  for (uint64_t size : *(Ctx->BindOutSizes))
+    totalBytes += size;
+  void *output = malloc(totalBytes);
   unsigned offset = 0;
-  for (unsigned i=0; i< Ctx->BindOutputBuffers->size(); i++) {
+  for (unsigned i = 0; i < Ctx->BindOutputBuffers->size(); i++) {
     uint64_t element = llvm_visc_bufferPop(Ctx->BindOutputBuffers->at(i));
-    //DEBUG(cout << "\tPopped Value " << element << " from buffer\n");
-    memcpy((char*)output+offset, &element, Ctx->BindOutSizes->at(i));
+    // DEBUG(cout << "\tPopped Value " << element << " from buffer\n");
+    memcpy((char *)output + offset, &element, Ctx->BindOutSizes->at(i));
     offset += Ctx->BindOutSizes->at(i);
   }
   return output;
 }
 
 // Wait API for a streaming dataflow graph
-void llvm_visc_streamWait(void* graphID) {
+void llvm_visc_streamWait(void *graphID) {
   DEBUG(cout << "StreamWait -- Graph: " << graphID << flush << "\n");
-  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
+  DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
   // Push garbage to all other input buffers
-  for (unsigned i=0; i< Ctx->BindInputBuffers->size(); i++) {
+  for (unsigned i = 0; i < Ctx->BindInputBuffers->size(); i++) {
     uint64_t element = 0;
-    //DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
+    // DEBUG(cout << "\tPushing Value " << element << " to buffer\n");
     llvm_visc_bufferPush(Ctx->BindInputBuffers->at(i), element);
   }
   // Push 1 in isLastInput buffers of all child nodes
-  for (unsigned i=0; i < Ctx->isLastInputBuffers->size(); i++)
+  for (unsigned i = 0; i < Ctx->isLastInputBuffers->size(); i++)
     llvm_visc_bufferPush(Ctx->isLastInputBuffers->at(i), 1);
 
   llvm_visc_freeThreads(graphID);
 }
 
 // Create a buffer and return the bufferID
-void* llvm_visc_createBindInBuffer(void* graphID, uint64_t size, unsigned inArgPort) {
-  DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindIn");
+void *llvm_visc_createBindInBuffer(void *graphID, uint64_t size,
+                                   unsigned inArgPort) {
+  DEBUG(cout << "Create BindInBuffer -- Graph: " << graphID
+             << ", Size: " << size << flush << "\n");
+  DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
+  CircularBuffer<uint64_t> *bufferID =
+      new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindIn");
   DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
   Context->BindInputBuffers->push_back(bufferID);
   (*(Context->ArgInPortSizeMap))[inArgPort] = size;
   Context->BindInSourcePort->push_back(inArgPort);
-  //Context->BindInSizes->push_back(size);
+  // Context->BindInSizes->push_back(size);
   return bufferID;
 }
 
-void* llvm_visc_createBindOutBuffer(void* graphID, uint64_t size) {
-  DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  //Twine name = Twine("Bind.Out.")+Twine(Context->BindOutputBuffers->size());
-  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindOut");
+void *llvm_visc_createBindOutBuffer(void *graphID, uint64_t size) {
+  DEBUG(cout << "Create BindOutBuffer -- Graph: " << graphID
+             << ", Size: " << size << flush << "\n");
+  DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
+  // Twine name = Twine("Bind.Out.")+Twine(Context->BindOutputBuffers->size());
+  CircularBuffer<uint64_t> *bufferID =
+      new CircularBuffer<uint64_t>(BUFFER_SIZE, "BindOut");
   DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
   Context->BindOutputBuffers->push_back(bufferID);
   Context->BindOutSizes->push_back(size);
   return bufferID;
 }
-void* llvm_visc_createEdgeBuffer(void* graphID, uint64_t size) {
-  DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  //Twine name = Twine("Edge.")+Twine(Context->EdgeBuffers->size());
-  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "Edge");
+void *llvm_visc_createEdgeBuffer(void *graphID, uint64_t size) {
+  DEBUG(cout << "Create EdgeBuffer -- Graph: " << graphID << ", Size: " << size
+             << flush << "\n");
+  DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
+  // Twine name = Twine("Edge.")+Twine(Context->EdgeBuffers->size());
+  CircularBuffer<uint64_t> *bufferID =
+      new CircularBuffer<uint64_t>(BUFFER_SIZE, "Edge");
   DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
   Context->EdgeBuffers->push_back(bufferID);
   Context->EdgeSizes->push_back(size);
   return bufferID;
 }
 
-void* llvm_visc_createLastInputBuffer(void* graphID, uint64_t size) {
-  DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID << ", Size: " << size << flush << "\n");
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  //Twine name = Twine("isLastInput.")+Twine(Context->EdgeBuffers->size());
-  CircularBuffer<uint64_t> *bufferID = new CircularBuffer<uint64_t>(BUFFER_SIZE, "LastInput");
+void *llvm_visc_createLastInputBuffer(void *graphID, uint64_t size) {
+  DEBUG(cout << "Create isLastInputBuffer -- Graph: " << graphID
+             << ", Size: " << size << flush << "\n");
+  DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
+  // Twine name = Twine("isLastInput.")+Twine(Context->EdgeBuffers->size());
+  CircularBuffer<uint64_t> *bufferID =
+      new CircularBuffer<uint64_t>(BUFFER_SIZE, "LastInput");
   DEBUG(cout << "\tNew Buffer: " << bufferID << flush << "\n");
   Context->isLastInputBuffers->push_back(bufferID);
   return bufferID;
 }
 
-// Free buffers 
-void llvm_visc_freeBuffers(void* graphID) {
+// Free buffers
+void llvm_visc_freeBuffers(void *graphID) {
   DEBUG(cout << "Free all buffers -- Graph: " << graphID << flush << "\n");
-  DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  for(CircularBuffer<uint64_t>* bufferID: *(Context->BindInputBuffers))
+  DFNodeContext_X86 *Context = (DFNodeContext_X86 *)graphID;
+  for (CircularBuffer<uint64_t> *bufferID : *(Context->BindInputBuffers))
     delete bufferID;
-  for(CircularBuffer<uint64_t>* bufferID: *(Context->BindOutputBuffers))
+  for (CircularBuffer<uint64_t> *bufferID : *(Context->BindOutputBuffers))
     delete bufferID;
-  for(CircularBuffer<uint64_t>* bufferID: *(Context->EdgeBuffers))
+  for (CircularBuffer<uint64_t> *bufferID : *(Context->EdgeBuffers))
     delete bufferID;
-  for(CircularBuffer<uint64_t>* bufferID: *(Context->isLastInputBuffers))
+  for (CircularBuffer<uint64_t> *bufferID : *(Context->isLastInputBuffers))
     delete bufferID;
 }
 
 // Pop an element from the buffer
-uint64_t llvm_visc_bufferPop(void* bufferID) {
-  CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID;
+uint64_t llvm_visc_bufferPop(void *bufferID) {
+  CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID;
   return buffer->pop();
 }
 
 // Push an element into the buffer
-void llvm_visc_bufferPush(void* bufferID, uint64_t element) {
-  CircularBuffer<uint64_t>* buffer = (CircularBuffer<uint64_t>*) bufferID;
+void llvm_visc_bufferPush(void *bufferID, uint64_t element) {
+  CircularBuffer<uint64_t> *buffer = (CircularBuffer<uint64_t> *)bufferID;
   buffer->push(element);
 }
 
 // Create a thread
-void llvm_visc_createThread(void* graphID, void* (*Func)(void*), void* arguments) {
-  DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func << ", Args: " << arguments << flush << "\n");
-  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
+void llvm_visc_createThread(void *graphID, void *(*Func)(void *),
+                            void *arguments) {
+  DEBUG(cout << "Create Thread -- Graph: " << graphID << ", Func: " << Func
+             << ", Args: " << arguments << flush << "\n");
+  DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
   int err;
   pthread_t threadID;
-  if((err = pthread_create(&threadID, NULL, Func, arguments)) != 0)
+  if ((err = pthread_create(&threadID, NULL, Func, arguments)) != 0)
     cout << "Failed to create thread. Error code = " << err << flush << "\n";
 
   Ctx->threads->push_back(threadID);
 }
 
 // Wait for thread to finish
-void llvm_visc_freeThreads(void* graphID) {
+void llvm_visc_freeThreads(void *graphID) {
   DEBUG(cout << "Free Threads -- Graph: " << graphID << flush << "\n");
-  DFNodeContext_X86* Ctx = (DFNodeContext_X86*) graphID;
-  for(pthread_t thread: *(Ctx->threads))
+  DFNodeContext_X86 *Ctx = (DFNodeContext_X86 *)graphID;
+  for (pthread_t thread : *(Ctx->threads))
     pthread_join(thread, NULL);
 }
 
 /************************ OPENCL & PTHREAD API ********************************/
 
-void* llvm_visc_x86_launch(void* (*rootFunc)(void*), void* arguments) {
-  DFNodeContext_X86 *Context = (DFNodeContext_X86*) malloc(sizeof(DFNodeContext_X86));
-  //int err;
-  //if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) != 0)
-    //cout << "Failed to create pthread. Error code = " << err << flush << "\n";
+void *llvm_visc_x86_launch(void *(*rootFunc)(void *), void *arguments) {
+  DFNodeContext_X86 *Context =
+      (DFNodeContext_X86 *)malloc(sizeof(DFNodeContext_X86));
+  // int err;
+  // if((err = pthread_create(&Context->threadID, NULL, rootFunc, arguments)) !=
+  // 0) cout << "Failed to create pthread. Error code = " << err << flush <<
+  // "\n";
   rootFunc(arguments);
   return Context;
 }
 
-void llvm_visc_x86_wait(void* graphID) {
+void llvm_visc_x86_wait(void *graphID) {
   DEBUG(cout << "Waiting for pthread to finish ...\n");
-  //DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
-  //pthread_join(Context->threadID, NULL);
+  // DFNodeContext_X86* Context = (DFNodeContext_X86*) graphID;
+  // pthread_join(Context->threadID, NULL);
   free(graphID);
   DEBUG(cout << "\t... pthread Done!\n");
 }
 
-void* llvm_visc_ocl_initContext(enum visc::Target T) {
+void *llvm_visc_ocl_initContext(enum visc::Target T) {
   pthread_mutex_lock(&ocl_mtx);
-  DEBUG(std::string Target = T == visc::GPU_TARGET? "GPU" : "SPIR");
+  DEBUG(std::string Target = T == visc::GPU_TARGET ? "GPU" : "SPIR");
   DEBUG(cout << "Initializing Context for " << Target << " device\n");
   cl_uint numPlatforms;
   cl_int errcode;
@@ -1246,51 +1270,51 @@ void* llvm_visc_ocl_initContext(enum visc::Target T) {
   checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
 
   // now get all the platform IDs
-  cl_platform_id* platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id)*numPlatforms);
+  cl_platform_id *platforms =
+      (cl_platform_id *)malloc(sizeof(cl_platform_id) * numPlatforms);
   errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
   checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
 
-
-  for(unsigned i=0; i < numPlatforms; i++) {
+  for (unsigned i = 0; i < numPlatforms; i++) {
     char buffer[10240];
     DEBUG(cout << "Device " << i << " Info -->\n");
     clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL);
     DEBUG(cout << "\tPROFILE = " << buffer << flush << "\n");
     clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL);
-    DEBUG(cout << "\tVERSION = "<< buffer << flush << "\n");
+    DEBUG(cout << "\tVERSION = " << buffer << flush << "\n");
     clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL);
     DEBUG(cout << "\tNAME = " << buffer << flush << "\n");
     clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL);
     DEBUG(cout << "\tVENDOR = " << buffer << flush << "\n");
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer,
+                      NULL);
     DEBUG(cout << "\tEXTENSIONS = " << buffer << flush << "\n");
   }
   // set platform property - just pick the first one
-  //cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
-                                        //(long) platforms[0],
-                                        //0};
-  //globalOCLContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
-                                         //NULL, NULL, &errcode);
+  // cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+  //(long) platforms[0],
+  // 0};
+  // globalOCLContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
+  // NULL, NULL, &errcode);
   // assert(numPlatforms >= 2 && "Expecting two OpenCL platforms");
   // Choose second one which is X86 AVX
-  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
-                                        (long) platforms[T == visc::GPU_TARGET? 0 : 1],
-                                        0};
-  globalOCLContext = clCreateContextFromType(properties,
-                                            T == visc::GPU_TARGET?
-                                              CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU,
-                                       NULL, NULL, &errcode);
+  cl_context_properties properties[] = {
+      CL_CONTEXT_PLATFORM, (long)platforms[T == visc::GPU_TARGET ? 0 : 1], 0};
+  globalOCLContext = clCreateContextFromType(
+      properties,
+      T == visc::GPU_TARGET ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, NULL,
+      NULL, &errcode);
   // get the list of OCL devices associated with context
   size_t dataBytes;
-  errcode = clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, 0,
-                              NULL, &dataBytes);
+  errcode = clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, 0, NULL,
+                             &dataBytes);
   checkErr(errcode, CL_SUCCESS, "Failure to get context info length");
 
-  clDevices = (cl_device_id *) malloc(dataBytes);
+  clDevices = (cl_device_id *)malloc(dataBytes);
   errcode |= clGetContextInfo(globalOCLContext, CL_CONTEXT_DEVICES, dataBytes,
                               clDevices, NULL);
   checkErr(errcode, CL_SUCCESS, "Failure to get context info");
-  if(false && T == visc::SPIR_TARGET) {
+  if (false && T == visc::SPIR_TARGET) {
     cl_device_partition_property props[4];
     props[0] = CL_DEVICE_PARTITION_BY_COUNTS;
     props[1] = NUM_CORES;
@@ -1300,12 +1324,14 @@ void* llvm_visc_ocl_initContext(enum visc::Target T) {
     cl_uint num_entries = 8;
 
     cl_uint numDevices;
-    clCreateSubDevices(clDevices[0], props, num_entries, subdevice_id, &numDevices);
-    //printf("Num of devices = %d\n", numDevices);
-    //for(unsigned i =0 ; i< numDevices; i++)
-      //printf("Subdevice id %d = %p\n", i, subdevice_id[i]);
+    clCreateSubDevices(clDevices[0], props, num_entries, subdevice_id,
+                       &numDevices);
+    // printf("Num of devices = %d\n", numDevices);
+    // for(unsigned i =0 ; i< numDevices; i++)
+    // printf("Subdevice id %d = %p\n", i, subdevice_id[i]);
     clDevices[0] = subdevice_id[0];
-    globalOCLContext = clCreateContext(properties, 1, clDevices, NULL, NULL, &errcode);
+    globalOCLContext =
+        clCreateContext(properties, 1, clDevices, NULL, NULL, &errcode);
     checkErr(errcode, CL_SUCCESS, "Failure to create OCL context");
   }
 
@@ -1320,120 +1346,134 @@ void* llvm_visc_ocl_initContext(enum visc::Target T) {
   return globalOCLContext;
 }
 
-void llvm_visc_ocl_clearContext(void* graphID) {
+void llvm_visc_ocl_clearContext(void *graphID) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Clear Context\n");
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
   // FIXME: Have separate function to release command queue and clear context.
   // Would be useful when a context has multiple command queues
   clReleaseKernel(Context->clKernel);
-  //clReleaseProgram(Context->clProgram);
-  //clReleaseCommandQueue(Context->clCommandQue);
-  //clReleaseContext(globalOCLContext);
-  //DEBUG(cout << "Released context at: " << globalOCLContext);
+  // clReleaseProgram(Context->clProgram);
+  // clReleaseCommandQueue(Context->clCommandQue);
+  // clReleaseContext(globalOCLContext);
+  // DEBUG(cout << "Released context at: " << globalOCLContext);
   free(Context);
   DEBUG(cout << "Done with OCL kernel\n");
   cout << "Printing VISC Timer: KernelTimer\n";
   visc_PrintTimerSet(&kernel_timer);
   pthread_mutex_unlock(&ocl_mtx);
-
 }
 
-void llvm_visc_ocl_argument_shared(void* graphID, int arg_index, size_t size) {
+void llvm_visc_ocl_argument_shared(void *graphID, int arg_index, size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Shared Memory Input:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n");
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
+             << flush << "\n");
+  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
   DEBUG(cout << "Using Context: " << Context << flush << "\n");
   DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n");
-  //pthread_mutex_lock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
   cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, NULL);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_unlock(&ocl_mtx);
   checkErr(errcode, CL_SUCCESS, "Failure to set shared memory argument");
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void llvm_visc_ocl_argument_scalar(void* graphID, void* input, int arg_index, size_t size) {
+void llvm_visc_ocl_argument_scalar(void *graphID, void *input, int arg_index,
+                                   size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Scalar Input:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size << flush << "\n");
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
+             << flush << "\n");
+  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
   DEBUG(cout << "Using Context: " << Context << flush << "\n");
   DEBUG(cout << "Using clKernel: " << Context->clKernel << flush << "\n");
-  //pthread_mutex_lock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
   cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, size, input);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_unlock(&ocl_mtx);
   checkErr(errcode, CL_SUCCESS, "Failure to set constant input argument");
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void* llvm_visc_ocl_argument_ptr(void* graphID, void* input, int arg_index, size_t size, bool isInput, bool isOutput) {
+void *llvm_visc_ocl_argument_ptr(void *graphID, void *input, int arg_index,
+                                 size_t size, bool isInput, bool isOutput) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set Pointer Input:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input << ", Size = "<< size << flush << "\n");
+  DEBUG(cout << "\tArgument Index = " << arg_index << ", Ptr = " << input
+             << ", Size = " << size << flush << "\n");
   // Size should be non-zero
   assert(size != 0 && "Size of data pointed to has to be non-zero!");
-  DEBUG(cout << "\tInput = "<< isInput << "\tOutput = " << isOutput << flush << "\n");
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
-  
+  DEBUG(cout << "\tInput = " << isInput << "\tOutput = " << isOutput << flush
+             << "\n");
+  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
+
   pthread_mutex_unlock(&ocl_mtx);
-  // Check with runtime the location of this memory 
-  cl_mem d_input = (cl_mem) llvm_visc_ocl_request_mem(input, size, Context, isInput, isOutput);
-  
+  // Check with runtime the location of this memory
+  cl_mem d_input = (cl_mem)llvm_visc_ocl_request_mem(input, size, Context,
+                                                     isInput, isOutput);
+
   pthread_mutex_lock(&ocl_mtx);
   // Set Kernel Argument
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_input);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
+  cl_int errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem),
+                                  (void *)&d_input);
+  // pthread_mutex_unlock(&ocl_mtx);
   checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
   DEBUG(cout << "\tDevicePtr = " << d_input << flush << "\n");
   pthread_mutex_unlock(&ocl_mtx);
   return d_input;
 }
 
-void* llvm_visc_ocl_output_ptr(void* graphID, int arg_index, size_t size) {
+void *llvm_visc_ocl_output_ptr(void *graphID, int arg_index, size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Set device memory for Output Struct:");
-  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = "<< size << flush << "\n");
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  DEBUG(cout << "\tArgument Index = " << arg_index << ", Size = " << size
+             << flush << "\n");
+  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
   cl_int errcode;
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_mem d_output = clCreateBuffer(Context->clOCLContext, CL_MEM_WRITE_ONLY, size, NULL, &errcode);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
+  cl_mem d_output = clCreateBuffer(Context->clOCLContext, CL_MEM_WRITE_ONLY,
+                                   size, NULL, &errcode);
+  // pthread_mutex_unlock(&ocl_mtx);
   checkErr(errcode, CL_SUCCESS, "Failure to create output buffer on device");
-  //pthread_mutex_lock(&ocl_mtx);
-  errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem), (void*)&d_output);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
+  errcode = clSetKernelArg(Context->clKernel, arg_index, sizeof(cl_mem),
+                           (void *)&d_output);
+  // pthread_mutex_unlock(&ocl_mtx);
   checkErr(errcode, CL_SUCCESS, "Failure to set pointer argument");
   DEBUG(cout << "\tDevicePtr = " << d_output << flush << "\n");
   pthread_mutex_unlock(&ocl_mtx);
   return d_output;
 }
 
-void llvm_visc_ocl_free(void* ptr) {
-  //DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n");
-  //cl_mem d_ptr = (cl_mem) ptr;
-  //clReleaseMemObject(d_ptr);
+void llvm_visc_ocl_free(void *ptr) {
+  // DEBUG(cout << "Release Device Pointer: " << ptr << flush << "\n");
+  // cl_mem d_ptr = (cl_mem) ptr;
+  // clReleaseMemObject(d_ptr);
 }
 
-void* llvm_visc_ocl_getOutput(void* graphID, void* h_output, void* d_output, size_t size) {
+void *llvm_visc_ocl_getOutput(void *graphID, void *h_output, void *d_output,
+                              size_t size) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Get Output:\n");
-  DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output << ", Size = "<< size << flush << "\n");
-  if(h_output == NULL)
+  DEBUG(cout << "\tHostPtr = " << h_output << ", DevicePtr = " << d_output
+             << ", Size = " << size << flush << "\n");
+  if (h_output == NULL)
     h_output = malloc(size);
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
-  //pthread_mutex_lock(&ocl_mtx);
-  cl_int errcode = clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0, size,
-                                h_output, 0, NULL, NULL);
-  //pthread_mutex_unlock(&ocl_mtx);
+  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
+  // pthread_mutex_lock(&ocl_mtx);
+  cl_int errcode =
+      clEnqueueReadBuffer(Context->clCommandQue, (cl_mem)d_output, CL_TRUE, 0,
+                          size, h_output, 0, NULL, NULL);
+  // pthread_mutex_unlock(&ocl_mtx);
   checkErr(errcode, CL_SUCCESS, "[getOutput] Failure to read output");
   pthread_mutex_unlock(&ocl_mtx);
   return h_output;
 }
 
-void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim , const size_t*
-                                localWorkSize, const size_t* globalWorkSize) {
+void *llvm_visc_ocl_executeNode(void *graphID, unsigned workDim,
+                                const size_t *localWorkSize,
+                                const size_t *globalWorkSize) {
   pthread_mutex_lock(&ocl_mtx);
 
   size_t GlobalWG[3];
@@ -1442,60 +1482,60 @@ void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim , const size_t*
   // OpenCL EnqeueNDRangeKernel function results in segementation fault if we
   // directly use local and global work groups arguments. Hence, allocating it
   // on stack and copying.
-  for(unsigned i=0; i<workDim; i++) {
+  for (unsigned i = 0; i < workDim; i++) {
     GlobalWG[i] = globalWorkSize[i];
   }
 
   // OpenCL allows local workgroup to be null.
-  if(localWorkSize != NULL) {
-    for(unsigned i=0; i<workDim; i++) {
+  if (localWorkSize != NULL) {
+    for (unsigned i = 0; i < workDim; i++) {
       LocalWG[i] = localWorkSize[i];
     }
   }
 
-  DFNodeContext_OCL* Context = (DFNodeContext_OCL*) graphID;
+  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
   // TODO: Would like to use event to ensure better scheduling of kernels.
   // Currently passing the event paratemeter results in seg fault with
   // clEnqueueNDRangeKernel.
-  cl_event* event;
+  cl_event *event = NULL; // never filled in (see TODO); return NULL, not junk
   DEBUG(cout << "Enqueuing kernel:\n");
   DEBUG(cout << "\tCommand Queue: " << Context->clCommandQue << flush << "\n");
   DEBUG(cout << "\tKernel: " << Context->clKernel << flush << "\n");
   DEBUG(cout << "\tNumber of dimensions: " << workDim << flush << "\n");
   DEBUG(cout << "\tGlobal Work Group: ( ");
-  for(unsigned i = 0; i<workDim; i++) {
+  for (unsigned i = 0; i < workDim; i++) {
     DEBUG(cout << GlobalWG[i] << " ");
   }
   DEBUG(cout << ")\n");
-  if(localWorkSize != NULL) {
+  if (localWorkSize != NULL) {
     DEBUG(cout << "\tLocal Work Group: ( ");
-    for(unsigned i = 0; i<workDim; i++) {
+    for (unsigned i = 0; i < workDim; i++) {
       DEBUG(cout << LocalWG[i] << " ");
     }
     DEBUG(cout << ")\n");
   }
-  //pthread_mutex_lock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
   clFinish(Context->clCommandQue);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_unlock(&ocl_mtx);
   visc_SwitchToTimer(&kernel_timer, visc_TimerID_COMPUTATION);
-  //for(int i=0 ;i < NUM_TESTS; i++) {
-    //cout << "Iteration = " << i << flush << "\n";
-    //pthread_mutex_lock(&ocl_mtx);
-    cl_int errcode = clEnqueueNDRangeKernel(Context->clCommandQue,
-        Context->clKernel, workDim, NULL, GlobalWG, (localWorkSize == NULL)? NULL :  LocalWG, 0, NULL, NULL);
-    //pthread_mutex_unlock(&ocl_mtx);
-    checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
+  // for(int i=0 ;i < NUM_TESTS; i++) {
+  // cout << "Iteration = " << i << flush << "\n";
+  // pthread_mutex_lock(&ocl_mtx);
+  cl_int errcode = clEnqueueNDRangeKernel(
+      Context->clCommandQue, Context->clKernel, workDim, NULL, GlobalWG,
+      (localWorkSize == NULL) ? NULL : LocalWG, 0, NULL, NULL);
+  // pthread_mutex_unlock(&ocl_mtx);
+  checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
   //}
-  //pthread_mutex_lock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
   clFinish(Context->clCommandQue);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_unlock(&ocl_mtx);
   visc_SwitchToTimer(&kernel_timer, visc_TimerID_NONE);
-  
+
   pthread_mutex_unlock(&ocl_mtx);
   return event;
 }
 
-
 //////////////////////////////////////////////////////////////////////////////
 //! Loads a Program binary file.
 //!
@@ -1503,17 +1543,15 @@ void* llvm_visc_ocl_executeNode(void* graphID, unsigned workDim , const size_t*
 //! @param Filename        program filename
 //! @param szFinalLength    returned length of the code string
 //////////////////////////////////////////////////////////////////////////////
-static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
-{
+static char *LoadProgSource(const char *Filename, size_t *szFinalLength) {
   DEBUG(cout << "Load Prog Source\n");
   // locals
-  FILE* pFileStream = NULL;
+  FILE *pFileStream = NULL;
   size_t szSourceLength;
 
   // open the OpenCL source code file
   pFileStream = fopen(Filename, "rb");
-  if(pFileStream == 0)
-  {
+  if (pFileStream == 0) {
     return NULL;
   }
 
@@ -1523,32 +1561,32 @@ static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
   fseek(pFileStream, 0, SEEK_SET);
 
   // allocate a buffer for the source code string and read it in
-  char* cSourceString = (char *)malloc(szSourceLength + 1);
-  if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
-  {
-      fclose(pFileStream);
-      free(cSourceString);
-      return 0;
+  char *cSourceString = (char *)malloc(szSourceLength + 1);
+  if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) {
+    fclose(pFileStream);
+    free(cSourceString);
+    return 0;
   }
 
-  // close the file and return the total length of the combined (preamble + source) string
+  // close the file and return the total length of the combined (preamble +
+  // source) string
   fclose(pFileStream);
-  if(szFinalLength != 0)
-  {
-      *szFinalLength = szSourceLength;
+  if (szFinalLength != 0) {
+    *szFinalLength = szSourceLength;
   }
   cSourceString[szSourceLength] = '\0';
 
   return cSourceString;
 }
 
-void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) {
+void *llvm_visc_ocl_launch(const char *FileName, const char *KernelName) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Launch OCL Kernel\n");
   // Initialize OpenCL
 
   // OpenCL specific variables
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL *) malloc(sizeof(DFNodeContext_OCL));
+  DFNodeContext_OCL *Context =
+      (DFNodeContext_OCL *)malloc(sizeof(DFNodeContext_OCL));
 
   size_t kernelLength;
   cl_int errcode;
@@ -1556,36 +1594,42 @@ void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) {
   // For a single context for all kernels
   Context->clOCLContext = globalOCLContext;
 
-  //Create a command-queue
-  //pthread_mutex_lock(&ocl_mtx);
-  Context->clCommandQue = clCreateCommandQueue(Context->clOCLContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode);
+  // Create a command-queue
+  // pthread_mutex_lock(&ocl_mtx);
+  Context->clCommandQue = clCreateCommandQueue(
+      Context->clOCLContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode);
   globalCommandQue = Context->clCommandQue;
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_unlock(&ocl_mtx);
   checkErr(errcode, CL_SUCCESS, "Failure to create command queue");
 
   DEBUG(cout << "Loading program binary: " << FileName << flush << "\n");
   char *programSource = LoadProgSource(FileName, &kernelLength);
-  checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary");
+  checkErr(programSource != NULL, 1 /*bool true*/,
+           "Failure to load Program Binary");
 
   cl_int binaryStatus;
-  //pthread_mutex_lock(&ocl_mtx);
-  Context->clProgram = clCreateProgramWithSource(Context->clOCLContext, 1, (const char **)&programSource, NULL, &errcode);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_lock(&ocl_mtx);
+  Context->clProgram = clCreateProgramWithSource(
+      Context->clOCLContext, 1, (const char **)&programSource, NULL, &errcode);
+  // pthread_mutex_unlock(&ocl_mtx);
   checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
 
-  DEBUG(cout << "Building kernel - " << KernelName << " from file " << FileName << flush << "\n");
-  errcode = clBuildProgram(Context->clProgram, 1, &clDevices[0], "", NULL, NULL);
+  DEBUG(cout << "Building kernel - " << KernelName << " from file " << FileName
+             << flush << "\n");
+  errcode =
+      clBuildProgram(Context->clProgram, 1, &clDevices[0], "", NULL, NULL);
   // If build fails, get build log from device
-  if(errcode != CL_SUCCESS) {
+  if (errcode != CL_SUCCESS) {
     cout << "ERROR: Failure to build program\n";
     size_t len = 0;
-    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0] , CL_PROGRAM_BUILD_LOG, 0,
-        NULL, &len);
+    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0],
+                                    CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
     cout << "LOG LENGTH: " << len << flush << "\n";
-    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length");
-    char *log = (char*) malloc(len*sizeof(char));
-    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0], CL_PROGRAM_BUILD_LOG, len,
-        log, NULL);
+    checkErr(errcode, CL_SUCCESS,
+             "Failure to collect program build log length");
+    char *log = (char *)malloc(len * sizeof(char));
+    errcode = clGetProgramBuildInfo(Context->clProgram, clDevices[0],
+                                    CL_PROGRAM_BUILD_LOG, len, log, NULL);
     checkErr(errcode, CL_SUCCESS, "Failure to collect program build log");
 
     cout << "Device Build Log:\n" << log << flush << "\n";
@@ -1598,48 +1642,44 @@ void* llvm_visc_ocl_launch(const char* FileName, const char* KernelName) {
   checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
 
   DEBUG(cout << "Kernel ID = " << Context->clKernel << "\n");
-  //free(clDevices);
+  // free(clDevices);
   free(programSource);
 
   pthread_mutex_unlock(&ocl_mtx);
   return Context;
 }
 
-
-void llvm_visc_ocl_wait(void* graphID) {
+void llvm_visc_ocl_wait(void *graphID) {
   pthread_mutex_lock(&ocl_mtx);
   DEBUG(cout << "Wait\n");
-  DFNodeContext_OCL *Context = (DFNodeContext_OCL*) graphID;
-  //pthread_mutex_lock(&ocl_mtx);
+  DFNodeContext_OCL *Context = (DFNodeContext_OCL *)graphID;
+  // pthread_mutex_lock(&ocl_mtx);
   clFinish(Context->clCommandQue);
-  //pthread_mutex_unlock(&ocl_mtx);
+  // pthread_mutex_unlock(&ocl_mtx);
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID timer) {
-  //cout << "Switching to timer " << timer << flush << "\n";
+void llvm_visc_switchToTimer(void **timerSet, enum visc_TimerID timer) {
+  // cout << "Switching to timer " << timer << flush << "\n";
   pthread_mutex_lock(&ocl_mtx);
-  //visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer);
+  // visc_SwitchToTimer((visc_TimerSet*)(*timerSet), timer);
   pthread_mutex_unlock(&ocl_mtx);
 }
-void llvm_visc_printTimerSet(void** timerSet, char* timerName) {
+void llvm_visc_printTimerSet(void **timerSet, char *timerName) {
   pthread_mutex_lock(&ocl_mtx);
   cout << "Printing VISC Timer: ";
-  if(timerName != NULL)
+  if (timerName != NULL)
     cout << timerName << flush << "\n";
   else
     cout << "Anonymous\n";
-  visc_PrintTimerSet((visc_TimerSet*) (*timerSet));
+  visc_PrintTimerSet((visc_TimerSet *)(*timerSet));
   pthread_mutex_unlock(&ocl_mtx);
 }
 
-void* llvm_visc_initializeTimerSet() {
+void *llvm_visc_initializeTimerSet() {
   pthread_mutex_lock(&ocl_mtx);
-  visc_TimerSet* TS = (visc_TimerSet*) malloc (sizeof(visc_TimerSet));
+  visc_TimerSet *TS = (visc_TimerSet *)malloc(sizeof(visc_TimerSet));
   visc_InitializeTimerSet(TS);
   pthread_mutex_unlock(&ocl_mtx);
   return TS;
 }
-
-
-
diff --git a/hpvm/projects/visc-rt/visc-rt.h b/hpvm/projects/visc-rt/visc-rt.h
index 9eab8f8b291966c021b42cb0ff0bbdf169772168..3ad315768bf90584a68c1d620ac68936e62a17f0 100644
--- a/hpvm/projects/visc-rt/visc-rt.h
+++ b/hpvm/projects/visc-rt/visc-rt.h
@@ -5,12 +5,12 @@
 #ifndef VISC_RT_HEADER
 #define VISC_RT_HEADER
 
+#include <ctime>
 #include <iostream>
 #include <map>
-#include <ctime>
-#include <vector>
 #include <pthread.h>
 #include <string>
+#include <vector>
 //#include <condition_variable>
 
 #include "../../include/SupportVISC/VISCHint.h"
@@ -19,13 +19,12 @@
 #include "policy.h"
 
 #ifndef DEBUG_BUILD
-#define DEBUG(s) {}
+#define DEBUG(s)                                                               \
+  {}
 #else
 #define DEBUG(s) s
 #endif
 
-
-
 using namespace std;
 
 extern "C" {
@@ -43,263 +42,250 @@ void llvm_visc_deviceAbstraction_waitOnDeviceStatus();
 
 /********************* DFG Depth Stack **********************************/
 class DFGDepth {
-  private:
-    unsigned numDim;
-    unsigned dimLimit[3];
-    unsigned dimInstance[3];
-  public:
-    DFGDepth() = default;
-
-    DFGDepth(unsigned n, unsigned dimX = 0, unsigned iX = 0, unsigned dimY = 0, unsigned iY = 0,
-        unsigned dimZ = 0, unsigned iZ = 0) {
-      assert(n <= 3 && "Error! More than 3 dimensions not supported");
-      numDim = n;
-      dimLimit[0] = dimX;
-      dimLimit[1] = dimY;
-      dimLimit[2] = dimZ;
-      dimInstance[0] = iX;
-      dimInstance[1] = iY;
-      dimInstance[2] = iZ;
-    }
+private:
+  unsigned numDim;         // dimension count given to the ctor (asserted <= 3)
+  unsigned dimLimit[3];    // per-dimension limit, indexed x=0, y=1, z=2
+  unsigned dimInstance[3]; // per-dimension instance, indexed x=0, y=1, z=2
 
-    unsigned getDimLimit(unsigned dim) const {
-      assert(dim <= numDim && "Error! Requested dimension limit is not specified");
-      return dimLimit[dim];
-    }
+public:
+  DFGDepth() = default; // default-initialization leaves the members unset
+
+  DFGDepth(unsigned n, unsigned dimX = 0, unsigned iX = 0, unsigned dimY = 0,
+           unsigned iY = 0, unsigned dimZ = 0, unsigned iZ = 0) {
+    assert(n <= 3 && "Error! More than 3 dimensions not supported");
+    numDim = n;
+    dimLimit[0] = dimX;
+    dimLimit[1] = dimY;
+    dimLimit[2] = dimZ;
+    dimInstance[0] = iX;
+    dimInstance[1] = iY;
+    dimInstance[2] = iZ;
+  }
 
-    unsigned getDimInstance(unsigned dim) const {
-      assert(dim <= numDim && "Error! Requested dimension instance is not specified");
-      return dimInstance[dim];
-    }
+  unsigned getDimLimit(unsigned dim) const {
+    assert(dim < 3 && dim <= numDim && // dim < 3 keeps the index in bounds
+           "Error! Requested dimension limit is not specified");
+    return dimLimit[dim];
+  }
 
-    unsigned getNumDim() const {
-      return numDim;
-    }
+  unsigned getDimInstance(unsigned dim) const {
+    assert(dim < 3 && dim <= numDim && // dim < 3 keeps the index in bounds
+           "Error! Requested dimension instance is not specified");
+    return dimInstance[dim];
+  }
+
+  unsigned getNumDim() const { return numDim; }
 };
 
 void llvm_visc_x86_dstack_push(unsigned n, uint64_t limitX = 0, uint64_t iX = 0,
-    uint64_t limitY = 0, uint64_t iY = 0, uint64_t limitZ = 0, uint64_t iZ = 0);
+                               uint64_t limitY = 0, uint64_t iY = 0,
+                               uint64_t limitZ = 0, uint64_t iZ = 0);
 void llvm_visc_x86_dstack_pop();
 uint64_t llvm_visc_x86_getDimLimit(unsigned level, unsigned dim);
 uint64_t llvm_visc_x86_getDimInstance(unsigned level, unsigned dim);
 
-
 /********************* Memory Tracker **********************************/
 class MemTrackerEntry {
 public:
-  enum Location {HOST, DEVICE};
-  private:
-    size_t size;
-    Location loc;
-    void* addr;
-    void* Context;
-
-  public:
-    MemTrackerEntry(size_t _size, Location _loc, void* _addr, void* _Context):
-      size(_size), loc(_loc), addr(_addr), Context(_Context) {
-    }
+  enum Location { HOST, DEVICE };
 
-    size_t getSize() const {
-      return size;
-    }
+private:
+  size_t size;
+  Location loc;
+  void *addr;
+  void *Context;
 
-    Location getLocation() const {
-      return loc;
-    }
+public:
+  MemTrackerEntry(size_t _size, Location _loc, void *_addr, void *_Context)
+      : size(_size), loc(_loc), addr(_addr), Context(_Context) {}
 
-    void* getAddress() const {
-      return addr;
-    }
+  size_t getSize() const { return size; }
 
-    void* getContext() const {
-      return Context;
-    }
+  Location getLocation() const { return loc; }
 
-    void update(Location _loc, void* _addr, void* _Context = NULL) {
-      loc = _loc;
-      addr = _addr;
-      Context = _Context;
-    }
+  void *getAddress() const { return addr; }
 
-    void print() {
-      cout << "Size = " << size << "\tLocation = " << loc << "\tAddress = " << addr << "\tContext = " << Context;
-    }
-};
+  void *getContext() const { return Context; }
+
+  void update(Location _loc, void *_addr, void *_Context = NULL) {
+    loc = _loc;
+    addr = _addr;
+    Context = _Context;
+  }
 
+  void print() {
+    cout << "Size = " << size << "\tLocation = " << loc
+         << "\tAddress = " << addr << "\tContext = " << Context;
+  }
+};
 
 class MemTracker {
 
 private:
-  std::map<void*, MemTrackerEntry*> Table;
+  std::map<void *, MemTrackerEntry *> Table;
 
 public:
-  MemTracker() {
-  }
+  MemTracker() {}
 
-  bool insert(void* ID, size_t size, MemTrackerEntry::Location loc, void* addr, void* Context = NULL) {
-    MemTrackerEntry* MTE = new MemTrackerEntry(size, loc, addr, Context);
-    Table.insert(std::pair<void*, MemTrackerEntry*>(ID, MTE));
+  bool insert(void *ID, size_t size, MemTrackerEntry::Location loc, void *addr,
+              void *Context = NULL) {
+    MemTrackerEntry *MTE = new MemTrackerEntry(size, loc, addr, Context);
+    Table.insert(std::pair<void *, MemTrackerEntry *>(ID, MTE));
     return MTE != NULL;
   }
 
-  MemTrackerEntry* lookup(void* ID) {
-    if(Table.count(ID) == 0)
+  MemTrackerEntry *lookup(void *ID) {
+    if (Table.count(ID) == 0)
       return NULL;
     return Table[ID];
   }
 
-  void remove(void* ID) {
-    MemTrackerEntry* MTE = Table[ID];
+  void remove(void *ID) {
+    MemTrackerEntry *MTE = Table[ID];
     free(MTE);
     Table.erase(ID);
   }
 
   void print() {
     cout << "Printing Table ... Size = " << Table.size() << flush << "\n";
-    for(auto& Entry: Table) {
-      cout << Entry.first << ":\t" ;
+    for (auto &Entry : Table) {
+      cout << Entry.first << ":\t";
       Entry.second->print();
       cout << flush << "\n";
     }
   }
-
 };
 
-void llvm_visc_track_mem(void*, size_t);
-void llvm_visc_untrack_mem(void*);
-void* llvm_visc_request_mem(void*, size_t);
+void llvm_visc_track_mem(void *, size_t);
+void llvm_visc_untrack_mem(void *);
+void *llvm_visc_request_mem(void *, size_t);
 
 /*********************** OPENCL & PTHREAD API **************************/
-void* llvm_visc_x86_launch(void* (void*), void*);
-void llvm_visc_x86_wait(void*);
-void* llvm_visc_ocl_initContext(enum visc::Target);
-
-void* llvm_visc_x86_argument_ptr(void*, size_t);
-
-void llvm_visc_ocl_clearContext(void*);
-void llvm_visc_ocl_argument_shared(void*, int, size_t);
-void llvm_visc_ocl_argument_scalar(void*, void*, int, size_t);
-void* llvm_visc_ocl_argument_ptr(void*, void*, int, size_t, bool, bool);
-void* llvm_visc_ocl_output_ptr(void*, int, size_t);
-void llvm_visc_ocl_free(void*);
-void* llvm_visc_ocl_getOutput(void*, void*, void*, size_t);
-void* llvm_visc_ocl_executeNode(void*, unsigned, const size_t*, const size_t*);
-void* llvm_visc_ocl_launch(const char*, const char*);
-void llvm_visc_ocl_wait(void*);
-
-void llvm_visc_switchToTimer(void** timerSet, enum visc_TimerID);
-void llvm_visc_printTimerSet(void** timerSet, char* timerName = NULL);
-void* llvm_visc_initializeTimerSet();
-
+void *llvm_visc_x86_launch(void *(void *), void *);
+void llvm_visc_x86_wait(void *);
+void *llvm_visc_ocl_initContext(enum visc::Target);
+
+void *llvm_visc_x86_argument_ptr(void *, size_t);
+
+void llvm_visc_ocl_clearContext(void *);
+void llvm_visc_ocl_argument_shared(void *, int, size_t);
+void llvm_visc_ocl_argument_scalar(void *, void *, int, size_t);
+void *llvm_visc_ocl_argument_ptr(void *, void *, int, size_t, bool, bool);
+void *llvm_visc_ocl_output_ptr(void *, int, size_t);
+void llvm_visc_ocl_free(void *);
+void *llvm_visc_ocl_getOutput(void *, void *, void *, size_t);
+void *llvm_visc_ocl_executeNode(void *, unsigned, const size_t *,
+                                const size_t *);
+void *llvm_visc_ocl_launch(const char *, const char *);
+void llvm_visc_ocl_wait(void *);
+
+void llvm_visc_switchToTimer(void **timerSet, enum visc_TimerID);
+void llvm_visc_printTimerSet(void **timerSet, char *timerName = NULL);
+void *llvm_visc_initializeTimerSet();
 }
 
 /*************************** Pipeline API ******************************/
 // Circular Buffer class
 unsigned counter = 0;
-template <class ElementType>
-class CircularBuffer {
+template <class ElementType> class CircularBuffer {
 private:
-    int numElements;
-    int bufferSize;
-    int Head;
-    int Tail;
-    pthread_mutex_t mtx;
-    pthread_cond_t cv;
-    vector<ElementType> buffer;
-    std::string name;
-    unsigned ID;
+  int numElements;            // count maintained by push() / pop()
+  int bufferSize;             // maxElements + 1 slots
+  int Head;                   // slot the next push() writes
+  int Tail;                   // slot the next pop() reads
+  pthread_mutex_t mtx;        // guards Head, Tail, numElements and buffer
+  pthread_cond_t cv;          // waited on by both push() and pop()
+  vector<ElementType> buffer; // slot storage, sized in the constructor
+  std::string name;           // label given at construction ("ANON" default)
+  unsigned ID;                // value of the global `counter` at construction
 
 public:
-    CircularBuffer(int maxElements, std::string _name =  "ANON") {
-        ID = counter;
-        Head = 0;
-        Tail = 0;
-        numElements = 0;
-        name = _name;
-        bufferSize = maxElements+1;
-        buffer.reserve(bufferSize);
-        pthread_mutex_init(&mtx, NULL);
-        pthread_cond_init(&cv, NULL);
-        counter++;
-
-    }
-
-    bool push(ElementType E);
-    ElementType pop();
+  CircularBuffer(int maxElements, std::string _name = "ANON") {
+    ID = counter;
+    Head = 0;
+    Tail = 0;
+    numElements = 0;
+    name = _name;
+    bufferSize = maxElements + 1;
+    buffer.resize(bufferSize); // not reserve(): push()/pop() use operator[]
+    pthread_mutex_init(&mtx, NULL);
+    pthread_cond_init(&cv, NULL);
+    counter++;
+  }
 
+  bool push(ElementType E);
+  ElementType pop();
 };
 
 template <class ElementType>
 bool CircularBuffer<ElementType>::push(ElementType E) {
-    //DEBUG(cout << name << " Buffer[" << ID << "]: Push " << E << flush << "\n");
-    //unique_lock<mutex> lk(mtx);
-    pthread_mutex_lock(&mtx);
-    if((Head +1) % bufferSize == Tail) {
-        //DEBUG(cout << name << " Buffer[" << ID << "]: Push going to sleep ...\n");
-        //cv.wait(lk);
-        pthread_cond_wait(&cv, &mtx);
-        //DEBUG(cout << name << " Buffer[" << ID << "]: Push woke up\n");
-    }
-    buffer[Head] = E;
-    Head = (Head+1) % bufferSize;
-    numElements++;
-    //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n");
-    //lk.unlock();
-    pthread_mutex_unlock(&mtx);
-    //cv.notify_one();
-    pthread_cond_signal(&cv);
-    return true;
+  // Producer side: blocks while the ring is full, then stores E at Head.
+  // "Full" is (Head + 1) % bufferSize == Tail; pop() treats Tail == Head as empty.
+  pthread_mutex_lock(&mtx);
+  while ((Head + 1) % bufferSize == Tail) {
+    // Re-test the condition after every wakeup: pthread_cond_wait may return
+    // spuriously, and another thread may have refilled the free slot.
+    pthread_cond_wait(&cv, &mtx);
+    // mtx is held again at this point.
+  }
+  buffer[Head] = E;
+  Head = (Head + 1) % bufferSize;
+  numElements++;
+  // State is updated; drop the lock, then wake one thread waiting on cv.
+  // Always returns true.
+  pthread_mutex_unlock(&mtx);
+  // cv is the same condition variable pop() waits on when the ring is empty.
+  pthread_cond_signal(&cv);
+  return true;
 }
 
-template <class ElementType>
-ElementType CircularBuffer<ElementType>::pop() {
-    //unique_lock<mutex> lk(mtx);
-    //DEBUG(cout << name << " Buffer[" << ID << "]: Pop\n");
-    pthread_mutex_lock(&mtx);
-    if(Tail == Head) {
-        //DEBUG(cout << name << " Buffer[" << ID << "]: Pop going to sleep ...\n");
-        //cv.wait(lk);
-        pthread_cond_wait(&cv, &mtx);
-        //DEBUG(cout << name << " Buffer[" << ID << "]: Pop woke up\n");
-    }
-    ElementType E = buffer[Tail];
-    Tail = (Tail + 1) % bufferSize;
-    numElements--;
-    //DEBUG(cout << name << " Buffer[" << ID << "]: Total Elements = " << numElements << flush << "\n");
-    //lk.unlock();
-    pthread_mutex_unlock(&mtx);
-    //cv.notify_one();
-    pthread_cond_signal(&cv);
-    return E;
+template <class ElementType> ElementType CircularBuffer<ElementType>::pop() {
+  // Consumer side: blocks while the ring is empty (Tail == Head), then
+  // returns a copy of the element stored at Tail.
+  pthread_mutex_lock(&mtx);
+  while (Tail == Head) {
+    // Re-test the condition after every wakeup: pthread_cond_wait may return
+    // spuriously, and another thread may have taken the element first.
+    pthread_cond_wait(&cv, &mtx);
+    // mtx is held again at this point.
+  }
+  ElementType E = buffer[Tail];
+  Tail = (Tail + 1) % bufferSize;
+  numElements--;
+  // The element was copied out under the lock; drop the lock, then wake one
+  // thread waiting on cv.
+  pthread_mutex_unlock(&mtx);
+  // cv is the same condition variable push() waits on when the ring is full.
+  pthread_cond_signal(&cv);
+  return E;
 }
 
 extern "C" {
 // Functions to push and pop values from pipeline buffers
-uint64_t llvm_visc_bufferPop(void*);
-void llvm_visc_bufferPush(void*, uint64_t);
+uint64_t llvm_visc_bufferPop(void *);
+void llvm_visc_bufferPush(void *, uint64_t);
 
 // Functions to create and destroy buffers
-void* llvm_visc_createBindInBuffer(void*, uint64_t, unsigned);
-void* llvm_visc_createBindOutBuffer(void*, uint64_t);
-void* llvm_visc_createEdgeBuffer(void*, uint64_t);
-void* llvm_visc_createLastInputBuffer(void*, uint64_t);
+void *llvm_visc_createBindInBuffer(void *, uint64_t, unsigned);
+void *llvm_visc_createBindOutBuffer(void *, uint64_t);
+void *llvm_visc_createEdgeBuffer(void *, uint64_t);
+void *llvm_visc_createLastInputBuffer(void *, uint64_t);
 
-void llvm_visc_freeBuffers(void*);
+void llvm_visc_freeBuffers(void *);
 
 // Functions to create and destroy threads
-void llvm_visc_createThread(void* graphID, void*(*Func)(void*), void*);
-void llvm_visc_freeThreads(void*);
+void llvm_visc_createThread(void *graphID, void *(*Func)(void *), void *);
+void llvm_visc_freeThreads(void *);
 
 // Launch API for a streaming graph.
 // Arguments:
 // (1) Launch Function: void* (void*, void*)
 // (2) Push Function:   void (void*, std::vector<uint64_t>**, unsgined)
 // (3) Pop Function:    void* (std::vector<uint64_t>**, unsigned)
-void* llvm_visc_streamLaunch(void(*LaunchFunc)(void*, void*), void*);
-void llvm_visc_streamPush(void* graphID, void* args);
-void* llvm_visc_streamPop(void* graphID);
-void llvm_visc_streamWait(void* graphID);
-
+void *llvm_visc_streamLaunch(void (*LaunchFunc)(void *, void *), void *);
+void llvm_visc_streamPush(void *graphID, void *args);
+void *llvm_visc_streamPop(void *graphID);
+void llvm_visc_streamWait(void *graphID);
 }
 
-#endif //VISC_RT_HEADER
+#endif // VISC_RT_HEADER
diff --git a/hpvm/test/CTestSuite/gemm.c b/hpvm/test/CTestSuite/gemm.c
index 2a54b88828e26c916b011b68455ad349f5929599..d0a69ba25c27fb65ea549023deed2dfb0197b882 100644
--- a/hpvm/test/CTestSuite/gemm.c
+++ b/hpvm/test/CTestSuite/gemm.c
@@ -1,6 +1,6 @@
-#include <stdlib.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 
 #define WA 1024
@@ -10,13 +10,11 @@
 #define WC WB
 #define HC HA
 
-
-
 // Thread block size
 #define BLOCK_SIZE 16
 
 // Allocates a matrix with random float entries.
-void randomInit(float* data, int size) {
+void randomInit(float *data, int size) {
   for (int i = 0; i < size; ++i)
     data[i] = rand() / (float)RAND_MAX;
 }
@@ -30,26 +28,25 @@ void randomInit(float* data, int size) {
 //////////////////////////////////////////////////////////////////////////////
 
 // Check bool
-int isEqual(float a, float b) {
-  return (fabs(a-b) < 0.001);
-}
+int isEqual(float a, float b) { return (fabs(a - b) < 0.001); }
 
 // Check Results
 
-__attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) {
+__attribute__((noinline)) int checkResults(float *A, float *B, float *C) {
   unsigned int size_A = WA * HA;
   unsigned int size_B = WB * HB;
   unsigned int size_C = WC * HC;
   unsigned int bytesC = sizeof(float) * size_C;
-  float* goldC = (float*) malloc(bytesC);
-  for (int i=0; i < HC; i++) {
-    for (int j=0; j < WC; j++) {
-      goldC[i*WC + j] = 0;
-      for (int k=0; k < HB; k++) {
-        goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j];
+  float *goldC = (float *)malloc(bytesC);
+  for (int i = 0; i < HC; i++) {
+    for (int j = 0; j < WC; j++) {
+      goldC[i * WC + j] = 0;
+      for (int k = 0; k < HB; k++) {
+        goldC[i * WC + j] += A[i * WA + k] * B[k * WB + j];
       }
-      if(!isEqual(goldC[i*WC + j], C[i*WC + j])) {
-        printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]);
+      if (!isEqual(goldC[i * WC + j], C[i * WC + j])) {
+        printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j,
+               C[i * WC + j], goldC[i * WC + j]);
         return 0;
       }
     }
@@ -58,36 +55,38 @@ __attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) {
 }
 
 // Dummy visc node execution call
-//void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned), int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void* outputs);
+// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned),
+// int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void*
+// outputs);
+
+void matrixMul(float *A, float *B, float *C, unsigned k, unsigned n) {
 
-void matrixMul(float* A, float* B, float* C, unsigned k, unsigned n) {
-  
   __visc__attributes(2, A, B, 1, C);
-  //printf("Entered function\n");
-  int tx = get_local_id(0); //2D Global Thread ID x
-  int ty = get_local_id(1); //2D Global Thread ID y
-  //int tx = get_global_id(0); //2D Global Thread ID x
-  //int ty = get_global_id(1); //2D Global Thread ID y
+  // printf("Entered function\n");
+  int tx = get_local_id(0); // 2D Global Thread ID x
+  int ty = get_local_id(1); // 2D Global Thread ID y
+  // int tx = get_global_id(0); //2D Global Thread ID x
+  // int ty = get_global_id(1); //2D Global Thread ID y
 
-  //printf("Computing element (%d, %d)\n", tx, ty);
+  // printf("Computing element (%d, %d)\n", tx, ty);
   // Initialize accumulator
   float res = 0.0f;
 
   // Perform dot-product of row-column
   for (int i = 0; i < k; i++) {
-    //printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx);
-    res += A[ty*k+i] * B[i*n+tx];
+    // printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx);
+    res += A[ty * k + i] * B[i * n + tx];
   }
 
-  //printf("Result computed\n");
+  // printf("Result computed\n");
   // Write in device memory
-  C[ty*n+tx] = res;
+  C[ty * n + tx] = res;
 
-  //printf("Result written to C\n");
+  // printf("Result written to C\n");
 }
 
 // Main
-int main(int argc, char** argv) {
+int main(int argc, char **argv) {
 
   // seed for rand()
   srand(2006);
@@ -95,46 +94,47 @@ int main(int argc, char** argv) {
   // Allocate host memory for matrices A and B
   unsigned int size_A = WA * HA;
   size_t bytes_A = sizeof(float) * size_A;
-  float* h_A = (float*) malloc(bytes_A);
+  float *h_A = (float *)malloc(bytes_A);
 
   unsigned int size_B = WB * HB;
   size_t bytes_B = sizeof(float) * size_B;
-  float* h_B = (float*) malloc(bytes_B);
-
-   // Initialize host memory
-   randomInit(h_A, size_A);
-   randomInit(h_B, size_B);
-
-/*
-   // Print A and B
-   printf("\n\nMatrix A\n");
-   for(int i = 0; i < size_A; i++)
-   {
-      printf("%f ", h_A[i]);
-      if(((i + 1) % WA) == 0)
-      printf("\n");
-   }
-
-   printf("\n\nMatrix B\n");
-   for(int i = 0; i < size_B; i++)
-   {
-      printf("%f ", h_B[i]);
-      if(((i + 1) % WB) == 0)
-      printf("\n");
-   }
-*/
+  float *h_B = (float *)malloc(bytes_B);
+
+  // Initialize host memory
+  randomInit(h_A, size_A);
+  randomInit(h_B, size_B);
+
+  /*
+     // Print A and B
+     printf("\n\nMatrix A\n");
+     for(int i = 0; i < size_A; i++)
+     {
+        printf("%f ", h_A[i]);
+        if(((i + 1) % WA) == 0)
+        printf("\n");
+     }
+
+     printf("\n\nMatrix B\n");
+     for(int i = 0; i < size_B; i++)
+     {
+        printf("%f ", h_B[i]);
+        if(((i + 1) % WB) == 0)
+        printf("\n");
+     }
+  */
 
   // Allocate host memory for the result matrix C
   unsigned int size_C = WC * HC;
   size_t bytes_C = sizeof(float) * size_C;
-  float* h_C = (float*) malloc(bytes_C);
+  float *h_C = (float *)malloc(bytes_C);
 
-   // Compute using OpenCL
-  //matrixMul(h_A, h_B, h_C, WA, WB);
+  // Compute using OpenCL
+  // matrixMul(h_A, h_B, h_C, WA, WB);
   //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0);
-  unsigned graphMM = __visc__node(matrixMul, 1, 2, WB, HA, 8, h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, 0);
+  unsigned graphMM = __visc__node(matrixMul, 1, 2, WB, HA, 8, h_A, bytes_A, h_B,
+                                  bytes_B, h_C, bytes_C, WA, WB, 0);
   __visc__wait(graphMM);
-  if(checkResults(h_A, h_B, h_C))
+  if (checkResults(h_A, h_B, h_C))
     printf("\nPass!\n");
   else
     printf("\nFailed!\n");
@@ -145,4 +145,3 @@ int main(int argc, char** argv) {
   free(h_B);
   free(h_C);
 }
-
diff --git a/hpvm/test/CTestSuite/gemm_2.c b/hpvm/test/CTestSuite/gemm_2.c
index aab0168bef72b00d036e450007a50d55727e515a..bd7ab27fc0160275442d23faf507851b7c2369f7 100644
--- a/hpvm/test/CTestSuite/gemm_2.c
+++ b/hpvm/test/CTestSuite/gemm_2.c
@@ -1,6 +1,6 @@
-#include <stdlib.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 
 #define WA 1024
@@ -10,13 +10,11 @@
 #define WC WB
 #define HC HA
 
-
-
 // Thread block size
 #define BLOCK_SIZE 16
 
 // Allocates a matrix with random float entries.
-void randomInit(float* data, int size) {
+void randomInit(float *data, int size) {
   for (int i = 0; i < size; ++i)
     data[i] = rand() / (float)RAND_MAX;
 }
@@ -30,26 +28,25 @@ void randomInit(float* data, int size) {
 //////////////////////////////////////////////////////////////////////////////
 
 // Check bool
-int isEqual(float a, float b) {
-  return (fabs(a-b) < 0.001);
-}
+int isEqual(float a, float b) { return (fabs(a - b) < 0.001); }
 
 // Check Results
 
-__attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) {
+__attribute__((noinline)) int checkResults(float *A, float *B, float *C) {
   unsigned int size_A = WA * HA;
   unsigned int size_B = WB * HB;
   unsigned int size_C = WC * HC;
   unsigned int bytesC = sizeof(float) * size_C;
-  float* goldC = (float*) malloc(bytesC);
-  for (int i=0; i < HC; i++) {
-    for (int j=0; j < WC; j++) {
-      goldC[i*WC + j] = 0;
-      for (int k=0; k < HB; k++) {
-        goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j];
+  float *goldC = (float *)malloc(bytesC);
+  for (int i = 0; i < HC; i++) {
+    for (int j = 0; j < WC; j++) {
+      goldC[i * WC + j] = 0;
+      for (int k = 0; k < HB; k++) {
+        goldC[i * WC + j] += A[i * WA + k] * B[k * WB + j];
       }
-      if(!isEqual(goldC[i*WC + j], C[i*WC + j])) {
-        printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]);
+      if (!isEqual(goldC[i * WC + j], C[i * WC + j])) {
+        printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j,
+               C[i * WC + j], goldC[i * WC + j]);
         return 0;
       }
     }
@@ -58,36 +55,38 @@ __attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) {
 }
 
 // Dummy visc node execution call
-//void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned), int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void* outputs);
+// void __visc__node(void kernel (float*, float*, float*, unsigned, unsigned),
+// int numDims, void* dims, int numInputs, void* inputs, int numOutputs, void*
+// outputs);
 
-void matrixMul( float* A, float* B, float* C, unsigned k, unsigned n) {
+void matrixMul(float *A, float *B, float *C, unsigned k, unsigned n) {
   __visc__attributes(2, A, B, 1, C);
 
-  //printf("Entered function\n");
-  int tx = get_global_id(0); //2D Global Thread ID x
-  int ty = get_global_id(1); //2D Global Thread ID y
-  //int tx = get_global_id(0); //2D Global Thread ID x
-  //int ty = get_global_id(1); //2D Global Thread ID y
+  // printf("Entered function\n");
+  int tx = get_global_id(0); // 2D Global Thread ID x
+  int ty = get_global_id(1); // 2D Global Thread ID y
+  // int tx = get_global_id(0); //2D Global Thread ID x
+  // int ty = get_global_id(1); //2D Global Thread ID y
 
-  //printf("Computing element (%d, %d)\n", tx, ty);
+  // printf("Computing element (%d, %d)\n", tx, ty);
   // Initialize accumulator
   float res = 0.0f;
 
   // Perform dot-product of row-column
   for (int i = 0; i < k; i++) {
-    //printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx);
-    res += A[ty*k+i] * B[i*n+tx];
+    // printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx);
+    res += A[ty * k + i] * B[i * n + tx];
   }
 
-  //printf("Result computed\n");
+  // printf("Result computed\n");
   // Write in device memory
-  C[ty*n+tx] = res;
+  C[ty * n + tx] = res;
 
-  //printf("Result written to C\n");
+  // printf("Result written to C\n");
 }
 
 // Main
-int main(int argc, char** argv) {
+int main(int argc, char **argv) {
 
   // seed for rand()
   srand(2006);
@@ -95,46 +94,48 @@ int main(int argc, char** argv) {
   // Allocate host memory for matrices A and B
   unsigned int size_A = WA * HA;
   size_t bytes_A = sizeof(float) * size_A;
-  float* h_A = (float*) malloc(bytes_A);
+  float *h_A = (float *)malloc(bytes_A);
 
   unsigned int size_B = WB * HB;
   size_t bytes_B = sizeof(float) * size_B;
-  float* h_B = (float*) malloc(bytes_B);
-
-   // Initialize host memory
-   randomInit(h_A, size_A);
-   randomInit(h_B, size_B);
-
-/*
-   // Print A and B
-   printf("\n\nMatrix A\n");
-   for(int i = 0; i < size_A; i++)
-   {
-      printf("%f ", h_A[i]);
-      if(((i + 1) % WA) == 0)
-      printf("\n");
-   }
-
-   printf("\n\nMatrix B\n");
-   for(int i = 0; i < size_B; i++)
-   {
-      printf("%f ", h_B[i]);
-      if(((i + 1) % WB) == 0)
-      printf("\n");
-   }
-*/
+  float *h_B = (float *)malloc(bytes_B);
+
+  // Initialize host memory
+  randomInit(h_A, size_A);
+  randomInit(h_B, size_B);
+
+  /*
+     // Print A and B
+     printf("\n\nMatrix A\n");
+     for(int i = 0; i < size_A; i++)
+     {
+        printf("%f ", h_A[i]);
+        if(((i + 1) % WA) == 0)
+        printf("\n");
+     }
+
+     printf("\n\nMatrix B\n");
+     for(int i = 0; i < size_B; i++)
+     {
+        printf("%f ", h_B[i]);
+        if(((i + 1) % WB) == 0)
+        printf("\n");
+     }
+  */
 
   // Allocate host memory for the result matrix C
   unsigned int size_C = WC * HC;
   size_t bytes_C = sizeof(float) * size_C;
-  float* h_C = (float*) malloc(bytes_C);
+  float *h_C = (float *)malloc(bytes_C);
 
-   // Compute using OpenCL
-  //matrixMul(h_A, h_B, h_C, WA, WB);
+  // Compute using OpenCL
+  // matrixMul(h_A, h_B, h_C, WA, WB);
   //__visc__node(matrixMul, 2, WB, HA, 3, h_A, h_B, h_C, 0);
-  unsigned graphMM = __visc__node(matrixMul, 2, 2, 16, 16, WB/16, HA/16, 8, h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, 0);
+  unsigned graphMM =
+      __visc__node(matrixMul, 2, 2, 16, 16, WB / 16, HA / 16, 8, h_A, bytes_A,
+                   h_B, bytes_B, h_C, bytes_C, WA, WB, 0);
   __visc__wait(graphMM);
-  if(checkResults(h_A, h_B, h_C))
+  if (checkResults(h_A, h_B, h_C))
     printf("\nPass!\n");
   else
     printf("\nFailed!\n");
@@ -145,4 +146,3 @@ int main(int argc, char** argv) {
   free(h_B);
   free(h_C);
 }
-
diff --git a/hpvm/test/hpvm-cava/scripts/gamut_map.cc b/hpvm/test/hpvm-cava/scripts/gamut_map.cc
index ba835f086895f58dc77a2f206590fe806b9010d2..ef5162e120aa95d0e56c6c6142770dc6503f8ce4 100644
--- a/hpvm/test/hpvm-cava/scripts/gamut_map.cc
+++ b/hpvm/test/hpvm-cava/scripts/gamut_map.cc
@@ -1,50 +1,41 @@
-#include <iostream>
 #include <cmath>
+#include <iostream>
 
 #include "gamut_map.h"
 
-void gamut_map(float* input,
-               int row_size,
-               int col_size,
-               int chan_size,
-               float* result,
-               float* ctrl_pts,
-               float* weights,
-               float* coefs,
+void gamut_map(float *input, int row_size, int col_size, int chan_size,
+               float *result, float *ctrl_pts, float *weights, float *coefs,
                int num_cps) {
 
-    ARRAY_3D(float, _input, input, col_size, chan_size);
-    ARRAY_3D(float, _result, result, col_size, chan_size);
-    ARRAY_2D(float, _ctrl_pts, ctrl_pts, chan_size);
-    ARRAY_2D(float, _weights, weights, chan_size);
-    ARRAY_2D(float, _coefs, coefs, chan_size);
+  ARRAY_3D(float, _input, input, col_size, chan_size);
+  ARRAY_3D(float, _result, result, col_size, chan_size);
+  ARRAY_2D(float, _ctrl_pts, ctrl_pts, chan_size);
+  ARRAY_2D(float, _weights, weights, chan_size);
+  ARRAY_2D(float, _coefs, coefs, chan_size);
 
-    float* l2_dist = new float[num_cps];
-    for (int row = 0; row < row_size; row++) {
-        for (int col = 0; col < col_size; col++) {
-            for (int cp = 0; cp < num_cps; cp++) {
-                l2_dist[cp] =
-                        sqrt((_input[row][col][0] - _ctrl_pts[cp][0]) *
-                                     (_input[row][col][0] - _ctrl_pts[cp][0]) +
-                             (_input[row][col][1] - _ctrl_pts[cp][1]) *
-                                     (_input[row][col][1] - _ctrl_pts[cp][1]) +
-                             (_input[row][col][2] - _ctrl_pts[cp][2]) *
-                                     (_input[row][col][2] - _ctrl_pts[cp][2]));
-            }
-            for (int chan = 0; chan < chan_size; chan++) {
-                float chan_val = 0.0;
-                for (int cp = 0; cp < num_cps; cp++) {
-                    chan_val += l2_dist[cp] * _weights[cp][chan];
-                }
-                // Add on the biases for the RBF
-                chan_val += _coefs[0][chan] +
-                            _coefs[1][chan] * _input[row][col][0] +
-                            _coefs[2][chan] * _input[row][col][1] +
-                            _coefs[3][chan] * _input[row][col][2];
-                _result[row][col][chan] = (chan_val > 0) ? chan_val : 0;
-            }
+  float *l2_dist = new float[num_cps];
+  for (int row = 0; row < row_size; row++) {
+    for (int col = 0; col < col_size; col++) {
+      for (int cp = 0; cp < num_cps; cp++) {
+        l2_dist[cp] = sqrt((_input[row][col][0] - _ctrl_pts[cp][0]) *
+                               (_input[row][col][0] - _ctrl_pts[cp][0]) +
+                           (_input[row][col][1] - _ctrl_pts[cp][1]) *
+                               (_input[row][col][1] - _ctrl_pts[cp][1]) +
+                           (_input[row][col][2] - _ctrl_pts[cp][2]) *
+                               (_input[row][col][2] - _ctrl_pts[cp][2]));
+      }
+      for (int chan = 0; chan < chan_size; chan++) {
+        float chan_val = 0.0;
+        for (int cp = 0; cp < num_cps; cp++) {
+          chan_val += l2_dist[cp] * _weights[cp][chan];
         }
+        // Add on the biases for the RBF
+        chan_val += _coefs[0][chan] + _coefs[1][chan] * _input[row][col][0] +
+                    _coefs[2][chan] * _input[row][col][1] +
+                    _coefs[3][chan] * _input[row][col][2];
+        _result[row][col][chan] = (chan_val > 0) ? chan_val : 0;
+      }
     }
-    delete l2_dist;
+  }
+  delete[] l2_dist; // l2_dist comes from new[], so it needs delete[]
 }
-
diff --git a/hpvm/test/hpvm-cava/scripts/gamut_map.h b/hpvm/test/hpvm-cava/scripts/gamut_map.h
index cd8d2ea9ae90bd71651193d7752fe97656d9617a..2742218fcb364d0eb6fe81666cca2921005b3007 100644
--- a/hpvm/test/hpvm-cava/scripts/gamut_map.h
+++ b/hpvm/test/hpvm-cava/scripts/gamut_map.h
@@ -2,20 +2,14 @@
 #define GAMUT_MAP_H
 
 #define ARRAY_2D(TYPE, output_array_name, input_array_name, DIM_1)             \
-    TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name
+  TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name
 
 #define ARRAY_3D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2)      \
-    TYPE(*output_array_name)[DIM_1][DIM_2] =                                   \
-        (TYPE(*)[DIM_1][DIM_2])input_array_name
+  TYPE(*output_array_name)                                                     \
+  [DIM_1][DIM_2] = (TYPE(*)[DIM_1][DIM_2])input_array_name
 
-void gamut_map(float* input,
-               int row_size,
-               int col_size,
-               int chan_size,
-               float* result,
-               float* ctrl_pts,
-               float* weights,
-               float* coefs,
+void gamut_map(float *input, int row_size, int col_size, int chan_size,
+               float *result, float *ctrl_pts, float *weights, float *coefs,
                int num_cps);
 
 #endif
diff --git a/hpvm/test/hpvm-cava/scripts/gamut_map_wrap.cc b/hpvm/test/hpvm-cava/scripts/gamut_map_wrap.cc
index 21864f4abc4c78ba41d42a984871894fcbd17271..a9efa8ff6e7e312cc46cae3442597d4faca16130 100644
--- a/hpvm/test/hpvm-cava/scripts/gamut_map_wrap.cc
+++ b/hpvm/test/hpvm-cava/scripts/gamut_map_wrap.cc
@@ -3,11 +3,11 @@
  * Version 3.0.8
  *
  * This file is not intended to be easily readable and contains a number of
- * coding conventions designed to improve portability and efficiency. Do not make
- * changes to this file unless you know what you are doing--modify the SWIG
+ * coding conventions designed to improve portability and efficiency. Do not
+ * make changes to this file unless you know what you are doing--modify the SWIG
  * interface file instead.
- * ----------------------------------------------------------------------------- */
-
+ * -----------------------------------------------------------------------------
+ */
 
 #ifndef SWIGPYTHON
 #define SWIGPYTHON
@@ -18,114 +18,124 @@
 /* -----------------------------------------------------------------------------
  *  This section contains generic SWIG labels for method/variable
  *  declarations/attributes, and other compiler dependent labels.
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 
-/* template workaround for compilers that cannot correctly implement the C++ standard */
+/* template workaround for compilers that cannot correctly implement the C++
+ * standard */
 #ifndef SWIGTEMPLATEDISAMBIGUATOR
-# if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x560)
-#  define SWIGTEMPLATEDISAMBIGUATOR template
-# elif defined(__HP_aCC)
-/* Needed even with `aCC -AA' when `aCC -V' reports HP ANSI C++ B3910B A.03.55 */
-/* If we find a maximum version that requires this, the test would be __HP_aCC <= 35500 for A.03.55 */
-#  define SWIGTEMPLATEDISAMBIGUATOR template
-# else
-#  define SWIGTEMPLATEDISAMBIGUATOR
-# endif
+#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x560)
+#define SWIGTEMPLATEDISAMBIGUATOR template
+#elif defined(__HP_aCC)
+/* Needed even with `aCC -AA' when `aCC -V' reports HP ANSI C++ B3910B A.03.55
+ */
+/* If we find a maximum version that requires this, the test would be __HP_aCC
+ * <= 35500 for A.03.55 */
+#define SWIGTEMPLATEDISAMBIGUATOR template
+#else
+#define SWIGTEMPLATEDISAMBIGUATOR
+#endif
 #endif
 
 /* inline attribute */
 #ifndef SWIGINLINE
-# if defined(__cplusplus) || (defined(__GNUC__) && !defined(__STRICT_ANSI__))
-#   define SWIGINLINE inline
-# else
-#   define SWIGINLINE
-# endif
+#if defined(__cplusplus) || (defined(__GNUC__) && !defined(__STRICT_ANSI__))
+#define SWIGINLINE inline
+#else
+#define SWIGINLINE
+#endif
 #endif
 
 /* attribute recognised by some compilers to avoid 'unused' warnings */
 #ifndef SWIGUNUSED
-# if defined(__GNUC__)
-#   if !(defined(__cplusplus)) || (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
-#     define SWIGUNUSED __attribute__ ((__unused__))
-#   else
-#     define SWIGUNUSED
-#   endif
-# elif defined(__ICC)
-#   define SWIGUNUSED __attribute__ ((__unused__))
-# else
-#   define SWIGUNUSED
-# endif
+#if defined(__GNUC__)
+#if !(defined(__cplusplus)) ||                                                 \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define SWIGUNUSED __attribute__((__unused__))
+#else
+#define SWIGUNUSED
+#endif
+#elif defined(__ICC)
+#define SWIGUNUSED __attribute__((__unused__))
+#else
+#define SWIGUNUSED
+#endif
 #endif
 
 #ifndef SWIG_MSC_UNSUPPRESS_4505
-# if defined(_MSC_VER)
-#   pragma warning(disable : 4505) /* unreferenced local function has been removed */
-# endif
+#if defined(_MSC_VER)
+#pragma warning(                                                               \
+    disable : 4505) /* unreferenced local function has been removed */
+#endif
 #endif
 
 #ifndef SWIGUNUSEDPARM
-# ifdef __cplusplus
-#   define SWIGUNUSEDPARM(p)
-# else
-#   define SWIGUNUSEDPARM(p) p SWIGUNUSED
-# endif
+#ifdef __cplusplus
+#define SWIGUNUSEDPARM(p)
+#else
+#define SWIGUNUSEDPARM(p) p SWIGUNUSED
+#endif
 #endif
 
 /* internal SWIG method */
 #ifndef SWIGINTERN
-# define SWIGINTERN static SWIGUNUSED
+#define SWIGINTERN static SWIGUNUSED
 #endif
 
 /* internal inline SWIG method */
 #ifndef SWIGINTERNINLINE
-# define SWIGINTERNINLINE SWIGINTERN SWIGINLINE
+#define SWIGINTERNINLINE SWIGINTERN SWIGINLINE
 #endif
 
 /* exporting methods */
 #if (__GNUC__ >= 4) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)
-#  ifndef GCC_HASCLASSVISIBILITY
-#    define GCC_HASCLASSVISIBILITY
-#  endif
+#ifndef GCC_HASCLASSVISIBILITY
+#define GCC_HASCLASSVISIBILITY
+#endif
 #endif
 
 #ifndef SWIGEXPORT
-# if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__)
-#   if defined(STATIC_LINKED)
-#     define SWIGEXPORT
-#   else
-#     define SWIGEXPORT __declspec(dllexport)
-#   endif
-# else
-#   if defined(__GNUC__) && defined(GCC_HASCLASSVISIBILITY)
-#     define SWIGEXPORT __attribute__ ((visibility("default")))
-#   else
-#     define SWIGEXPORT
-#   endif
-# endif
+#if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__)
+#if defined(STATIC_LINKED)
+#define SWIGEXPORT
+#else
+#define SWIGEXPORT __declspec(dllexport)
+#endif
+#else
+#if defined(__GNUC__) && defined(GCC_HASCLASSVISIBILITY)
+#define SWIGEXPORT __attribute__((visibility("default")))
+#else
+#define SWIGEXPORT
+#endif
+#endif
 #endif
 
 /* calling conventions for Windows */
 #ifndef SWIGSTDCALL
-# if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__)
-#   define SWIGSTDCALL __stdcall
-# else
-#   define SWIGSTDCALL
-# endif
+#if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__)
+#define SWIGSTDCALL __stdcall
+#else
+#define SWIGSTDCALL
+#endif
 #endif
 
 /* Deal with Microsoft's attempt at deprecating C standard runtime functions */
-#if !defined(SWIG_NO_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER) && !defined(_CRT_SECURE_NO_DEPRECATE)
-# define _CRT_SECURE_NO_DEPRECATE
+#if !defined(SWIG_NO_CRT_SECURE_NO_DEPRECATE) && defined(_MSC_VER) &&          \
+    !defined(_CRT_SECURE_NO_DEPRECATE)
+#define _CRT_SECURE_NO_DEPRECATE
 #endif
 
-/* Deal with Microsoft's attempt at deprecating methods in the standard C++ library */
-#if !defined(SWIG_NO_SCL_SECURE_NO_DEPRECATE) && defined(_MSC_VER) && !defined(_SCL_SECURE_NO_DEPRECATE)
-# define _SCL_SECURE_NO_DEPRECATE
+/* Deal with Microsoft's attempt at deprecating methods in the standard C++
+ * library */
+#if !defined(SWIG_NO_SCL_SECURE_NO_DEPRECATE) && defined(_MSC_VER) &&          \
+    !defined(_SCL_SECURE_NO_DEPRECATE)
+#define _SCL_SECURE_NO_DEPRECATE
 #endif
 
 /* Deal with Apple's deprecated 'AssertMacros.h' from Carbon-framework */
-#if defined(__APPLE__) && !defined(__ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES)
-# define __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES 0
+#if defined(__APPLE__) &&                                                      \
+    !defined(__ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES)
+#define __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES 0
 #endif
 
 /* Intel's compiler complains if a variable which was never initialised is
@@ -134,17 +144,16 @@
  * See: https://github.com/swig/swig/issues/192 for more discussion.
  */
 #ifdef __INTEL_COMPILER
-# pragma warning disable 592
+#pragma warning disable 592
 #endif
 
-
 #if defined(_DEBUG) && defined(SWIG_PYTHON_INTERPRETER_NO_DEBUG)
 /* Use debug wrappers with the Python release dll */
-# undef _DEBUG
-# include <Python.h>
-# define _DEBUG
+#undef _DEBUG
+#include <Python.h>
+#define _DEBUG
 #else
-# include <Python.h>
+#include <Python.h>
 #endif
 
 /* -----------------------------------------------------------------------------
@@ -152,19 +161,20 @@
  *
  * This file contains generic C API SWIG runtime support for pointer
  * type checking.
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 
-/* This should only be incremented when either the layout of swig_type_info changes,
-   or for whatever reason, the runtime changes incompatibly */
+/* This should only be incremented when either the layout of swig_type_info
+   changes, or for whatever reason, the runtime changes incompatibly */
 #define SWIG_RUNTIME_VERSION "4"
 
 /* define SWIG_TYPE_TABLE_NAME as "SWIG_TYPE_TABLE" */
 #ifdef SWIG_TYPE_TABLE
-# define SWIG_QUOTE_STRING(x) #x
-# define SWIG_EXPAND_AND_QUOTE_STRING(x) SWIG_QUOTE_STRING(x)
-# define SWIG_TYPE_TABLE_NAME SWIG_EXPAND_AND_QUOTE_STRING(SWIG_TYPE_TABLE)
+#define SWIG_QUOTE_STRING(x) #x
+#define SWIG_EXPAND_AND_QUOTE_STRING(x) SWIG_QUOTE_STRING(x)
+#define SWIG_TYPE_TABLE_NAME SWIG_EXPAND_AND_QUOTE_STRING(SWIG_TYPE_TABLE)
 #else
-# define SWIG_TYPE_TABLE_NAME
+#define SWIG_TYPE_TABLE_NAME
 #endif
 
 /*
@@ -177,25 +187,24 @@
 */
 
 #ifndef SWIGRUNTIME
-# define SWIGRUNTIME SWIGINTERN
+#define SWIGRUNTIME SWIGINTERN
 #endif
 
 #ifndef SWIGRUNTIMEINLINE
-# define SWIGRUNTIMEINLINE SWIGRUNTIME SWIGINLINE
+#define SWIGRUNTIMEINLINE SWIGRUNTIME SWIGINLINE
 #endif
 
 /*  Generic buffer size */
 #ifndef SWIG_BUFFER_SIZE
-# define SWIG_BUFFER_SIZE 1024
+#define SWIG_BUFFER_SIZE 1024
 #endif
 
 /* Flags for pointer conversions */
-#define SWIG_POINTER_DISOWN        0x1
-#define SWIG_CAST_NEW_MEMORY       0x2
+#define SWIG_POINTER_DISOWN 0x1
+#define SWIG_CAST_NEW_MEMORY 0x2
 
 /* Flags for new pointer objects */
-#define SWIG_POINTER_OWN           0x1
-
+#define SWIG_POINTER_OWN 0x1
 
 /*
    Flags/methods for returning states.
@@ -232,7 +241,7 @@
       // success code
       if (SWIG_IsNewObj(res) {
         ...
-	delete *ptr;
+        delete *ptr;
       } else {
         ...
       }
@@ -258,9 +267,9 @@
         }
       }
 
-   Of course, returning the plain '0(success)/-1(fail)' still works, but you can be
-   more explicit by returning SWIG_BADOBJ, SWIG_ERROR or any of the
-   SWIG errors code.
+   Of course, returning the plain '0(success)/-1(fail)' still works, but you can
+   be more explicit by returning SWIG_BADOBJ, SWIG_ERROR or any of the SWIG
+   errors code.
 
    Finally, if the SWIG_CASTRANK_MODE is enabled, the result code
    allows to return the 'cast rank', for example, if you have this
@@ -276,52 +285,53 @@
    just use the SWIG_AddCast()/SWIG_CheckState()
 */
 
-#define SWIG_OK                    (0)
-#define SWIG_ERROR                 (-1)
-#define SWIG_IsOK(r)               (r >= 0)
-#define SWIG_ArgError(r)           ((r != SWIG_ERROR) ? r : SWIG_TypeError)
+#define SWIG_OK (0)
+#define SWIG_ERROR (-1)
+#define SWIG_IsOK(r) (r >= 0)
+#define SWIG_ArgError(r) ((r != SWIG_ERROR) ? r : SWIG_TypeError)
 
 /* The CastRankLimit says how many bits are used for the cast rank */
-#define SWIG_CASTRANKLIMIT         (1 << 8)
+#define SWIG_CASTRANKLIMIT (1 << 8)
 /* The NewMask denotes the object was created (using new/malloc) */
-#define SWIG_NEWOBJMASK            (SWIG_CASTRANKLIMIT  << 1)
+#define SWIG_NEWOBJMASK (SWIG_CASTRANKLIMIT << 1)
 /* The TmpMask is for in/out typemaps that use temporal objects */
-#define SWIG_TMPOBJMASK            (SWIG_NEWOBJMASK << 1)
+#define SWIG_TMPOBJMASK (SWIG_NEWOBJMASK << 1)
 /* Simple returning values */
-#define SWIG_BADOBJ                (SWIG_ERROR)
-#define SWIG_OLDOBJ                (SWIG_OK)
-#define SWIG_NEWOBJ                (SWIG_OK | SWIG_NEWOBJMASK)
-#define SWIG_TMPOBJ                (SWIG_OK | SWIG_TMPOBJMASK)
+#define SWIG_BADOBJ (SWIG_ERROR)
+#define SWIG_OLDOBJ (SWIG_OK)
+#define SWIG_NEWOBJ (SWIG_OK | SWIG_NEWOBJMASK)
+#define SWIG_TMPOBJ (SWIG_OK | SWIG_TMPOBJMASK)
 /* Check, add and del mask methods */
-#define SWIG_AddNewMask(r)         (SWIG_IsOK(r) ? (r | SWIG_NEWOBJMASK) : r)
-#define SWIG_DelNewMask(r)         (SWIG_IsOK(r) ? (r & ~SWIG_NEWOBJMASK) : r)
-#define SWIG_IsNewObj(r)           (SWIG_IsOK(r) && (r & SWIG_NEWOBJMASK))
-#define SWIG_AddTmpMask(r)         (SWIG_IsOK(r) ? (r | SWIG_TMPOBJMASK) : r)
-#define SWIG_DelTmpMask(r)         (SWIG_IsOK(r) ? (r & ~SWIG_TMPOBJMASK) : r)
-#define SWIG_IsTmpObj(r)           (SWIG_IsOK(r) && (r & SWIG_TMPOBJMASK))
+#define SWIG_AddNewMask(r) (SWIG_IsOK(r) ? (r | SWIG_NEWOBJMASK) : r)
+#define SWIG_DelNewMask(r) (SWIG_IsOK(r) ? (r & ~SWIG_NEWOBJMASK) : r)
+#define SWIG_IsNewObj(r) (SWIG_IsOK(r) && (r & SWIG_NEWOBJMASK))
+#define SWIG_AddTmpMask(r) (SWIG_IsOK(r) ? (r | SWIG_TMPOBJMASK) : r)
+#define SWIG_DelTmpMask(r) (SWIG_IsOK(r) ? (r & ~SWIG_TMPOBJMASK) : r)
+#define SWIG_IsTmpObj(r) (SWIG_IsOK(r) && (r & SWIG_TMPOBJMASK))
 
 /* Cast-Rank Mode */
 #if defined(SWIG_CASTRANK_MODE)
-#  ifndef SWIG_TypeRank
-#    define SWIG_TypeRank             unsigned long
-#  endif
-#  ifndef SWIG_MAXCASTRANK            /* Default cast allowed */
-#    define SWIG_MAXCASTRANK          (2)
-#  endif
-#  define SWIG_CASTRANKMASK          ((SWIG_CASTRANKLIMIT) -1)
-#  define SWIG_CastRank(r)           (r & SWIG_CASTRANKMASK)
+#ifndef SWIG_TypeRank
+#define SWIG_TypeRank unsigned long
+#endif
+#ifndef SWIG_MAXCASTRANK /* Default cast allowed */
+#define SWIG_MAXCASTRANK (2)
+#endif
+#define SWIG_CASTRANKMASK ((SWIG_CASTRANKLIMIT)-1)
+#define SWIG_CastRank(r) (r & SWIG_CASTRANKMASK)
 SWIGINTERNINLINE int SWIG_AddCast(int r) {
-  return SWIG_IsOK(r) ? ((SWIG_CastRank(r) < SWIG_MAXCASTRANK) ? (r + 1) : SWIG_ERROR) : r;
+  return SWIG_IsOK(r)
+             ? ((SWIG_CastRank(r) < SWIG_MAXCASTRANK) ? (r + 1) : SWIG_ERROR)
+             : r;
 }
 SWIGINTERNINLINE int SWIG_CheckState(int r) {
   return SWIG_IsOK(r) ? SWIG_CastRank(r) + 1 : 0;
 }
 #else /* no cast-rank mode */
-#  define SWIG_AddCast(r) (r)
-#  define SWIG_CheckState(r) (SWIG_IsOK(r) ? 1 : 0)
+#define SWIG_AddCast(r) (r)
+#define SWIG_CheckState(r) (SWIG_IsOK(r) ? 1 : 0)
 #endif
 
-
 #include <string.h>
 
 #ifdef __cplusplus
@@ -333,32 +343,37 @@ typedef struct swig_type_info *(*swig_dycast_func)(void **);
 
 /* Structure to store information on one type */
 typedef struct swig_type_info {
-  const char             *name;			/* mangled name of this type */
-  const char             *str;			/* human readable name of this type */
-  swig_dycast_func        dcast;		/* dynamic cast function down a hierarchy */
-  struct swig_cast_info  *cast;			/* linked list of types that can cast into this type */
-  void                   *clientdata;		/* language specific type data */
-  int                    owndata;		/* flag if the structure owns the clientdata */
+  const char *name;       /* mangled name of this type */
+  const char *str;        /* human readable name of this type */
+  swig_dycast_func dcast; /* dynamic cast function down a hierarchy */
+  struct swig_cast_info
+      *cast;        /* linked list of types that can cast into this type */
+  void *clientdata; /* language specific type data */
+  int owndata;      /* flag if the structure owns the clientdata */
 } swig_type_info;
 
 /* Structure to store a type and conversion function used for casting */
 typedef struct swig_cast_info {
-  swig_type_info         *type;			/* pointer to type that is equivalent to this type */
-  swig_converter_func     converter;		/* function to cast the void pointers */
-  struct swig_cast_info  *next;			/* pointer to next cast in linked list */
-  struct swig_cast_info  *prev;			/* pointer to the previous cast */
+  swig_type_info *type; /* pointer to type that is equivalent to this type */
+  swig_converter_func converter; /* function to cast the void pointers */
+  struct swig_cast_info *next;   /* pointer to next cast in linked list */
+  struct swig_cast_info *prev;   /* pointer to the previous cast */
 } swig_cast_info;
 
 /* Structure used to store module information
  * Each module generates one structure like this, and the runtime collects
  * all of these structures and stores them in a circularly linked list.*/
 typedef struct swig_module_info {
-  swig_type_info         **types;		/* Array of pointers to swig_type_info structures that are in this module */
-  size_t                 size;		        /* Number of types in this module */
-  struct swig_module_info *next;		/* Pointer to next element in circularly linked list */
-  swig_type_info         **type_initial;	/* Array of initially generated type structures */
-  swig_cast_info         **cast_initial;	/* Array of initially generated casting structures */
-  void                    *clientdata;		/* Language specific module data */
+  swig_type_info **types; /* Array of pointers to swig_type_info structures that
+                             are in this module */
+  size_t size;            /* Number of types in this module */
+  struct swig_module_info
+      *next; /* Pointer to next element in circularly linked list */
+  swig_type_info *
+      *type_initial; /* Array of initially generated type structures */
+  swig_cast_info *
+      *cast_initial; /* Array of initially generated casting structures */
+  void *clientdata;  /* Language specific module data */
 } swig_module_info;
 
 /*
@@ -368,13 +383,15 @@ typedef struct swig_module_info {
   Return 0 when the two name types are equivalent, as in
   strncmp, but skipping ' '.
 */
-SWIGRUNTIME int
-SWIG_TypeNameComp(const char *f1, const char *l1,
-		  const char *f2, const char *l2) {
-  for (;(f1 != l1) && (f2 != l2); ++f1, ++f2) {
-    while ((*f1 == ' ') && (f1 != l1)) ++f1;
-    while ((*f2 == ' ') && (f2 != l2)) ++f2;
-    if (*f1 != *f2) return (*f1 > *f2) ? 1 : -1;
+SWIGRUNTIME int SWIG_TypeNameComp(const char *f1, const char *l1,
+                                  const char *f2, const char *l2) {
+  for (; (f1 != l1) && (f2 != l2); ++f1, ++f2) {
+    while ((*f1 == ' ') && (f1 != l1))
+      ++f1;
+    while ((*f2 == ' ') && (f2 != l2))
+      ++f2;
+    if (*f1 != *f2)
+      return (*f1 > *f2) ? 1 : -1;
   }
   return (int)((l1 - f1) - (l2 - f2));
 }
@@ -383,17 +400,18 @@ SWIG_TypeNameComp(const char *f1, const char *l1,
   Check type equivalence in a name list like <name1>|<name2>|...
   Return 0 if equal, -1 if nb < tb, 1 if nb > tb
 */
-SWIGRUNTIME int
-SWIG_TypeCmp(const char *nb, const char *tb) {
+SWIGRUNTIME int SWIG_TypeCmp(const char *nb, const char *tb) {
   int equiv = 1;
-  const char* te = tb + strlen(tb);
-  const char* ne = nb;
+  const char *te = tb + strlen(tb);
+  const char *ne = nb;
   while (equiv != 0 && *ne) {
     for (nb = ne; *ne; ++ne) {
-      if (*ne == '|') break;
+      if (*ne == '|')
+        break;
     }
     equiv = SWIG_TypeNameComp(nb, ne, tb, te);
-    if (*ne) ++ne;
+    if (*ne)
+      ++ne;
   }
   return equiv;
 }
@@ -402,16 +420,14 @@ SWIG_TypeCmp(const char *nb, const char *tb) {
   Check type equivalence in a name list like <name1>|<name2>|...
   Return 0 if not equal, 1 if equal
 */
-SWIGRUNTIME int
-SWIG_TypeEquiv(const char *nb, const char *tb) {
+SWIGRUNTIME int SWIG_TypeEquiv(const char *nb, const char *tb) {
   return SWIG_TypeCmp(nb, tb) == 0 ? 1 : 0;
 }
 
 /*
   Check the typename
 */
-SWIGRUNTIME swig_cast_info *
-SWIG_TypeCheck(const char *c, swig_type_info *ty) {
+SWIGRUNTIME swig_cast_info *SWIG_TypeCheck(const char *c, swig_type_info *ty) {
   if (ty) {
     swig_cast_info *iter = ty->cast;
     while (iter) {
@@ -424,7 +440,8 @@ SWIG_TypeCheck(const char *c, swig_type_info *ty) {
           iter->next->prev = iter->prev;
         iter->next = ty->cast;
         iter->prev = 0;
-        if (ty->cast) ty->cast->prev = iter;
+        if (ty->cast)
+          ty->cast->prev = iter;
         ty->cast = iter;
         return iter;
       }
@@ -435,10 +452,11 @@ SWIG_TypeCheck(const char *c, swig_type_info *ty) {
 }
 
 /*
-  Identical to SWIG_TypeCheck, except strcmp is replaced with a pointer comparison
+  Identical to SWIG_TypeCheck, except strcmp is replaced with a pointer
+  comparison
 */
-SWIGRUNTIME swig_cast_info *
-SWIG_TypeCheckStruct(swig_type_info *from, swig_type_info *ty) {
+SWIGRUNTIME swig_cast_info *SWIG_TypeCheckStruct(swig_type_info *from,
+                                                 swig_type_info *ty) {
   if (ty) {
     swig_cast_info *iter = ty->cast;
     while (iter) {
@@ -451,7 +469,8 @@ SWIG_TypeCheckStruct(swig_type_info *from, swig_type_info *ty) {
           iter->next->prev = iter->prev;
         iter->next = ty->cast;
         iter->prev = 0;
-        if (ty->cast) ty->cast->prev = iter;
+        if (ty->cast)
+          ty->cast->prev = iter;
         ty->cast = iter;
         return iter;
       }
@@ -464,21 +483,23 @@ SWIG_TypeCheckStruct(swig_type_info *from, swig_type_info *ty) {
 /*
   Cast a pointer up an inheritance hierarchy
 */
-SWIGRUNTIMEINLINE void *
-SWIG_TypeCast(swig_cast_info *ty, void *ptr, int *newmemory) {
+SWIGRUNTIMEINLINE void *SWIG_TypeCast(swig_cast_info *ty, void *ptr,
+                                      int *newmemory) {
   return ((!ty) || (!ty->converter)) ? ptr : (*ty->converter)(ptr, newmemory);
 }
 
 /*
    Dynamic pointer casting. Down an inheritance hierarchy
 */
-SWIGRUNTIME swig_type_info *
-SWIG_TypeDynamicCast(swig_type_info *ty, void **ptr) {
+SWIGRUNTIME swig_type_info *SWIG_TypeDynamicCast(swig_type_info *ty,
+                                                 void **ptr) {
   swig_type_info *lastty = ty;
-  if (!ty || !ty->dcast) return ty;
+  if (!ty || !ty->dcast)
+    return ty;
   while (ty && (ty->dcast)) {
     ty = (*ty->dcast)(ptr);
-    if (ty) lastty = ty;
+    if (ty)
+      lastty = ty;
   }
   return lastty;
 }
@@ -486,8 +507,7 @@ SWIG_TypeDynamicCast(swig_type_info *ty, void **ptr) {
 /*
   Return the name associated with this type
 */
-SWIGRUNTIMEINLINE const char *
-SWIG_TypeName(const swig_type_info *ty) {
+SWIGRUNTIMEINLINE const char *SWIG_TypeName(const swig_type_info *ty) {
   return ty->name;
 }
 
@@ -495,29 +515,28 @@ SWIG_TypeName(const swig_type_info *ty) {
   Return the pretty name associated with this type,
   that is an unmangled type name in a form presentable to the user.
 */
-SWIGRUNTIME const char *
-SWIG_TypePrettyName(const swig_type_info *type) {
+SWIGRUNTIME const char *SWIG_TypePrettyName(const swig_type_info *type) {
   /* The "str" field contains the equivalent pretty names of the
      type, separated by vertical-bar characters.  We choose
      to print the last name, as it is often (?) the most
      specific. */
-  if (!type) return NULL;
+  if (!type)
+    return NULL;
   if (type->str != NULL) {
     const char *last_name = type->str;
     const char *s;
     for (s = type->str; *s; s++)
-      if (*s == '|') last_name = s+1;
+      if (*s == '|')
+        last_name = s + 1;
     return last_name;
-  }
-  else
+  } else
     return type->name;
 }
 
 /*
    Set the clientdata field for a type
 */
-SWIGRUNTIME void
-SWIG_TypeClientData(swig_type_info *ti, void *clientdata) {
+SWIGRUNTIME void SWIG_TypeClientData(swig_type_info *ti, void *clientdata) {
   swig_cast_info *cast = ti->cast;
   /* if (ti->clientdata == clientdata) return; */
   ti->clientdata = clientdata;
@@ -526,14 +545,13 @@ SWIG_TypeClientData(swig_type_info *ti, void *clientdata) {
     if (!cast->converter) {
       swig_type_info *tc = cast->type;
       if (!tc->clientdata) {
-	SWIG_TypeClientData(tc, clientdata);
+        SWIG_TypeClientData(tc, clientdata);
       }
     }
     cast = cast->next;
   }
 }
-SWIGRUNTIME void
-SWIG_TypeNewClientData(swig_type_info *ti, void *clientdata) {
+SWIGRUNTIME void SWIG_TypeNewClientData(swig_type_info *ti, void *clientdata) {
   SWIG_TypeClientData(ti, clientdata);
   ti->owndata = 1;
 }
@@ -543,38 +561,37 @@ SWIG_TypeNewClientData(swig_type_info *ti, void *clientdata) {
   Search is a O(log #types)
 
   We start searching at module start, and finish searching when start == end.
-  Note: if start == end at the beginning of the function, we go all the way around
-  the circular list.
+  Note: if start == end at the beginning of the function, we go all the way
+  around the circular list.
 */
-SWIGRUNTIME swig_type_info *
-SWIG_MangledTypeQueryModule(swig_module_info *start,
-                            swig_module_info *end,
-		            const char *name) {
+SWIGRUNTIME swig_type_info *SWIG_MangledTypeQueryModule(swig_module_info *start,
+                                                        swig_module_info *end,
+                                                        const char *name) {
   swig_module_info *iter = start;
   do {
     if (iter->size) {
       size_t l = 0;
       size_t r = iter->size - 1;
       do {
-	/* since l+r >= 0, we can (>> 1) instead (/ 2) */
-	size_t i = (l + r) >> 1;
-	const char *iname = iter->types[i]->name;
-	if (iname) {
-	  int compare = strcmp(name, iname);
-	  if (compare == 0) {
-	    return iter->types[i];
-	  } else if (compare < 0) {
-	    if (i) {
-	      r = i - 1;
-	    } else {
-	      break;
-	    }
-	  } else if (compare > 0) {
-	    l = i + 1;
-	  }
-	} else {
-	  break; /* should never happen */
-	}
+        /* since l+r >= 0, we can (>> 1) instead (/ 2) */
+        size_t i = (l + r) >> 1;
+        const char *iname = iter->types[i]->name;
+        if (iname) {
+          int compare = strcmp(name, iname);
+          if (compare == 0) {
+            return iter->types[i];
+          } else if (compare < 0) {
+            if (i) {
+              r = i - 1;
+            } else {
+              break;
+            }
+          } else if (compare > 0) {
+            l = i + 1;
+          }
+        } else {
+          break; /* should never happen */
+        }
       } while (l <= r);
     }
     iter = iter->next;
@@ -583,18 +600,18 @@ SWIG_MangledTypeQueryModule(swig_module_info *start,
 }
 
 /*
-  Search for a swig_type_info structure for either a mangled name or a human readable name.
-  It first searches the mangled names of the types, which is a O(log #types)
-  If a type is not found it then searches the human readable names, which is O(#types).
+  Search for a swig_type_info structure for either a mangled name or a human
+  readable name. It first searches the mangled names of the types, which is a
+  O(log #types) If a type is not found it then searches the human readable
+  names, which is O(#types).
 
   We start searching at module start, and finish searching when start == end.
-  Note: if start == end at the beginning of the function, we go all the way around
-  the circular list.
+  Note: if start == end at the beginning of the function, we go all the way
+  around the circular list.
 */
-SWIGRUNTIME swig_type_info *
-SWIG_TypeQueryModule(swig_module_info *start,
-                     swig_module_info *end,
-		     const char *name) {
+SWIGRUNTIME swig_type_info *SWIG_TypeQueryModule(swig_module_info *start,
+                                                 swig_module_info *end,
+                                                 const char *name) {
   /* STEP 1: Search the name field using binary search */
   swig_type_info *ret = SWIG_MangledTypeQueryModule(start, end, name);
   if (ret) {
@@ -606,8 +623,8 @@ SWIG_TypeQueryModule(swig_module_info *start,
     do {
       size_t i = 0;
       for (; i < iter->size; ++i) {
-	if (iter->types[i]->str && (SWIG_TypeEquiv(iter->types[i]->str, name)))
-	  return iter->types[i];
+        if (iter->types[i]->str && (SWIG_TypeEquiv(iter->types[i]->str, name)))
+          return iter->types[i];
       }
       iter = iter->next;
     } while (iter != end);
@@ -620,11 +637,10 @@ SWIG_TypeQueryModule(swig_module_info *start,
 /*
    Pack binary data into a string
 */
-SWIGRUNTIME char *
-SWIG_PackData(char *c, void *ptr, size_t sz) {
+SWIGRUNTIME char *SWIG_PackData(char *c, void *ptr, size_t sz) {
   static const char hex[17] = "0123456789abcdef";
-  const unsigned char *u = (unsigned char *) ptr;
-  const unsigned char *eu =  u + sz;
+  const unsigned char *u = (unsigned char *)ptr;
+  const unsigned char *eu = u + sz;
   for (; u != eu; ++u) {
     unsigned char uu = *u;
     *(c++) = hex[(uu & 0xf0) >> 4];
@@ -636,9 +652,8 @@ SWIG_PackData(char *c, void *ptr, size_t sz) {
 /*
    Unpack binary data from a string
 */
-SWIGRUNTIME const char *
-SWIG_UnpackData(const char *c, void *ptr, size_t sz) {
-  unsigned char *u = (unsigned char *) ptr;
+SWIGRUNTIME const char *SWIG_UnpackData(const char *c, void *ptr, size_t sz) {
+  unsigned char *u = (unsigned char *)ptr;
   const unsigned char *eu = u + sz;
   for (; u != eu; ++u) {
     char d = *(c++);
@@ -646,16 +661,16 @@ SWIG_UnpackData(const char *c, void *ptr, size_t sz) {
     if ((d >= '0') && (d <= '9'))
       uu = ((d - '0') << 4);
     else if ((d >= 'a') && (d <= 'f'))
-      uu = ((d - ('a'-10)) << 4);
+      uu = ((d - ('a' - 10)) << 4);
     else
-      return (char *) 0;
+      return (char *)0;
     d = *(c++);
     if ((d >= '0') && (d <= '9'))
       uu |= (d - '0');
     else if ((d >= 'a') && (d <= 'f'))
-      uu |= (d - ('a'-10));
+      uu |= (d - ('a' - 10));
     else
-      return (char *) 0;
+      return (char *)0;
     *u = uu;
   }
   return c;
@@ -664,56 +679,59 @@ SWIG_UnpackData(const char *c, void *ptr, size_t sz) {
 /*
    Pack 'void *' into a string buffer.
 */
-SWIGRUNTIME char *
-SWIG_PackVoidPtr(char *buff, void *ptr, const char *name, size_t bsz) {
+SWIGRUNTIME char *SWIG_PackVoidPtr(char *buff, void *ptr, const char *name,
+                                   size_t bsz) {
   char *r = buff;
-  if ((2*sizeof(void *) + 2) > bsz) return 0;
+  if ((2 * sizeof(void *) + 2) > bsz)
+    return 0;
   *(r++) = '_';
-  r = SWIG_PackData(r,&ptr,sizeof(void *));
-  if (strlen(name) + 1 > (bsz - (r - buff))) return 0;
-  strcpy(r,name);
+  r = SWIG_PackData(r, &ptr, sizeof(void *));
+  if (strlen(name) + 1 > (bsz - (r - buff)))
+    return 0;
+  strcpy(r, name);
   return buff;
 }
 
-SWIGRUNTIME const char *
-SWIG_UnpackVoidPtr(const char *c, void **ptr, const char *name) {
+SWIGRUNTIME const char *SWIG_UnpackVoidPtr(const char *c, void **ptr,
+                                           const char *name) {
   if (*c != '_') {
-    if (strcmp(c,"NULL") == 0) {
-      *ptr = (void *) 0;
+    if (strcmp(c, "NULL") == 0) {
+      *ptr = (void *)0;
       return name;
     } else {
       return 0;
     }
   }
-  return SWIG_UnpackData(++c,ptr,sizeof(void *));
+  return SWIG_UnpackData(++c, ptr, sizeof(void *));
 }
 
-SWIGRUNTIME char *
-SWIG_PackDataName(char *buff, void *ptr, size_t sz, const char *name, size_t bsz) {
+SWIGRUNTIME char *SWIG_PackDataName(char *buff, void *ptr, size_t sz,
+                                    const char *name, size_t bsz) {
   char *r = buff;
   size_t lname = (name ? strlen(name) : 0);
-  if ((2*sz + 2 + lname) > bsz) return 0;
+  if ((2 * sz + 2 + lname) > bsz)
+    return 0;
   *(r++) = '_';
-  r = SWIG_PackData(r,ptr,sz);
+  r = SWIG_PackData(r, ptr, sz);
   if (lname) {
-    strncpy(r,name,lname+1);
+    strncpy(r, name, lname + 1);
   } else {
     *r = 0;
   }
   return buff;
 }
 
-SWIGRUNTIME const char *
-SWIG_UnpackDataName(const char *c, void *ptr, size_t sz, const char *name) {
+SWIGRUNTIME const char *SWIG_UnpackDataName(const char *c, void *ptr, size_t sz,
+                                            const char *name) {
   if (*c != '_') {
-    if (strcmp(c,"NULL") == 0) {
-      memset(ptr,0,sz);
+    if (strcmp(c, "NULL") == 0) {
+      memset(ptr, 0, sz);
       return name;
     } else {
       return 0;
     }
   }
-  return SWIG_UnpackData(++c,ptr,sz);
+  return SWIG_UnpackData(++c, ptr, sz);
 }
 
 #ifdef __cplusplus
@@ -721,21 +739,19 @@ SWIG_UnpackDataName(const char *c, void *ptr, size_t sz, const char *name) {
 #endif
 
 /*  Errors in SWIG */
-#define  SWIG_UnknownError    	   -1
-#define  SWIG_IOError        	   -2
-#define  SWIG_RuntimeError   	   -3
-#define  SWIG_IndexError     	   -4
-#define  SWIG_TypeError      	   -5
-#define  SWIG_DivisionByZero 	   -6
-#define  SWIG_OverflowError  	   -7
-#define  SWIG_SyntaxError    	   -8
-#define  SWIG_ValueError     	   -9
-#define  SWIG_SystemError    	   -10
-#define  SWIG_AttributeError 	   -11
-#define  SWIG_MemoryError    	   -12
-#define  SWIG_NullReferenceError   -13
-
-
+#define SWIG_UnknownError -1
+#define SWIG_IOError -2
+#define SWIG_RuntimeError -3
+#define SWIG_IndexError -4
+#define SWIG_TypeError -5
+#define SWIG_DivisionByZero -6
+#define SWIG_OverflowError -7
+#define SWIG_SyntaxError -8
+#define SWIG_ValueError -9
+#define SWIG_SystemError -10
+#define SWIG_AttributeError -11
+#define SWIG_MemoryError -12
+#define SWIG_NullReferenceError -13
 
 /* Compatibility macros for Python 3 */
 #if PY_VERSION_HEX >= 0x03000000
@@ -747,9 +763,9 @@ SWIG_UnpackDataName(const char *c, void *ptr, size_t sz, const char *name) {
 #define PyInt_FromSize_t(x) PyLong_FromSize_t(x)
 #define PyString_Check(name) PyBytes_Check(name)
 #define PyString_FromString(x) PyUnicode_FromString(x)
-#define PyString_Format(fmt, args)  PyUnicode_Format(fmt, args)
+#define PyString_Format(fmt, args) PyUnicode_Format(fmt, args)
 #define PyString_AsString(str) PyBytes_AsString(str)
-#define PyString_Size(str) PyBytes_Size(str)	
+#define PyString_Size(str) PyBytes_Size(str)
 #define PyString_InternFromString(key) PyUnicode_InternFromString(key)
 #define Py_TPFLAGS_HAVE_CLASS Py_TPFLAGS_BASETYPE
 #define PyString_AS_STRING(x) PyUnicode_AS_STRING(x)
@@ -758,32 +774,29 @@ SWIG_UnpackDataName(const char *c, void *ptr, size_t sz, const char *name) {
 #endif
 
 #ifndef Py_TYPE
-#  define Py_TYPE(op) ((op)->ob_type)
+#define Py_TYPE(op) ((op)->ob_type)
 #endif
 
 /* SWIG APIs for compatibility of both Python 2 & 3 */
 
 #if PY_VERSION_HEX >= 0x03000000
-#  define SWIG_Python_str_FromFormat PyUnicode_FromFormat
+#define SWIG_Python_str_FromFormat PyUnicode_FromFormat
 #else
-#  define SWIG_Python_str_FromFormat PyString_FromFormat
+#define SWIG_Python_str_FromFormat PyString_FromFormat
 #endif
 
-
 /* Warning: This function will allocate a new string in Python 3,
  * so please call SWIG_Python_str_DelForPy3(x) to free the space.
  */
-SWIGINTERN char*
-SWIG_Python_str_AsChar(PyObject *str)
-{
+SWIGINTERN char *SWIG_Python_str_AsChar(PyObject *str) {
 #if PY_VERSION_HEX >= 0x03000000
   char *cstr;
   char *newstr;
   Py_ssize_t len;
   str = PyUnicode_AsUTF8String(str);
   PyBytes_AsStringAndSize(str, &cstr, &len);
-  newstr = (char *) malloc(len+1);
-  memcpy(newstr, cstr, len+1);
+  newstr = (char *)malloc(len + 1);
+  memcpy(newstr, cstr, len + 1);
   Py_XDECREF(str);
   return newstr;
 #else
@@ -792,17 +805,14 @@ SWIG_Python_str_AsChar(PyObject *str)
 }
 
 #if PY_VERSION_HEX >= 0x03000000
-#  define SWIG_Python_str_DelForPy3(x) free( (void*) (x) )
+#define SWIG_Python_str_DelForPy3(x) free((void *)(x))
 #else
-#  define SWIG_Python_str_DelForPy3(x) 
+#define SWIG_Python_str_DelForPy3(x)
 #endif
 
-
-SWIGINTERN PyObject*
-SWIG_Python_str_FromChar(const char *c)
-{
+SWIGINTERN PyObject *SWIG_Python_str_FromChar(const char *c) {
 #if PY_VERSION_HEX >= 0x03000000
-  return PyUnicode_FromString(c); 
+  return PyUnicode_FromString(c);
 #else
   return PyString_FromString(c);
 #endif
@@ -810,22 +820,21 @@ SWIG_Python_str_FromChar(const char *c)
 
 /* Add PyOS_snprintf for old Pythons */
 #if PY_VERSION_HEX < 0x02020000
-# if defined(_MSC_VER) || defined(__BORLANDC__) || defined(_WATCOM)
-#  define PyOS_snprintf _snprintf
-# else
-#  define PyOS_snprintf snprintf
-# endif
+#if defined(_MSC_VER) || defined(__BORLANDC__) || defined(_WATCOM)
+#define PyOS_snprintf _snprintf
+#else
+#define PyOS_snprintf snprintf
+#endif
 #endif
 
 /* A crude PyString_FromFormat implementation for old Pythons */
 #if PY_VERSION_HEX < 0x02020000
 
 #ifndef SWIG_PYBUFFER_SIZE
-# define SWIG_PYBUFFER_SIZE 1024
+#define SWIG_PYBUFFER_SIZE 1024
 #endif
 
-static PyObject *
-PyString_FromFormat(const char *fmt, ...) {
+static PyObject *PyString_FromFormat(const char *fmt, ...) {
   va_list ap;
   char buf[SWIG_PYBUFFER_SIZE * 2];
   int res;
@@ -838,48 +847,50 @@ PyString_FromFormat(const char *fmt, ...) {
 
 /* Add PyObject_Del for old Pythons */
 #if PY_VERSION_HEX < 0x01060000
-# define PyObject_Del(op) PyMem_DEL((op))
+#define PyObject_Del(op) PyMem_DEL((op))
 #endif
 #ifndef PyObject_DEL
-# define PyObject_DEL PyObject_Del
+#define PyObject_DEL PyObject_Del
 #endif
 
 /* A crude PyExc_StopIteration exception for old Pythons */
 #if PY_VERSION_HEX < 0x02020000
-# ifndef PyExc_StopIteration
-#  define PyExc_StopIteration PyExc_RuntimeError
-# endif
-# ifndef PyObject_GenericGetAttr
-#  define PyObject_GenericGetAttr 0
-# endif
+#ifndef PyExc_StopIteration
+#define PyExc_StopIteration PyExc_RuntimeError
+#endif
+#ifndef PyObject_GenericGetAttr
+#define PyObject_GenericGetAttr 0
+#endif
 #endif
 
 /* Py_NotImplemented is defined in 2.1 and up. */
 #if PY_VERSION_HEX < 0x02010000
-# ifndef Py_NotImplemented
-#  define Py_NotImplemented PyExc_RuntimeError
-# endif
+#ifndef Py_NotImplemented
+#define Py_NotImplemented PyExc_RuntimeError
+#endif
 #endif
 
 /* A crude PyString_AsStringAndSize implementation for old Pythons */
 #if PY_VERSION_HEX < 0x02010000
-# ifndef PyString_AsStringAndSize
-#  define PyString_AsStringAndSize(obj, s, len) {*s = PyString_AsString(obj); *len = *s ? strlen(*s) : 0;}
-# endif
+#ifndef PyString_AsStringAndSize
+#define PyString_AsStringAndSize(obj, s, len)                                  \
+  {                                                                            \
+    *s = PyString_AsString(obj);                                               \
+    *len = *s ? strlen(*s) : 0;                                                \
+  }
+#endif
 #endif
 
 /* PySequence_Size for old Pythons */
 #if PY_VERSION_HEX < 0x02000000
-# ifndef PySequence_Size
-#  define PySequence_Size PySequence_Length
-# endif
+#ifndef PySequence_Size
+#define PySequence_Size PySequence_Length
+#endif
 #endif
 
 /* PyBool_FromLong for old Pythons */
 #if PY_VERSION_HEX < 0x02030000
-static
-PyObject *PyBool_FromLong(long ok)
-{
+static PyObject *PyBool_FromLong(long ok) {
   PyObject *result = ok ? Py_True : Py_False;
   Py_INCREF(result);
   return result;
@@ -891,8 +902,8 @@ PyObject *PyBool_FromLong(long ok)
 /* http://www.python.org/dev/peps/pep-0353/#conversion-guidelines */
 #if PY_VERSION_HEX < 0x02050000 && !defined(PY_SSIZE_T_MIN)
 typedef int Py_ssize_t;
-# define PY_SSIZE_T_MAX INT_MAX
-# define PY_SSIZE_T_MIN INT_MIN
+#define PY_SSIZE_T_MAX INT_MAX
+#define PY_SSIZE_T_MIN INT_MIN
 typedef inquiry lenfunc;
 typedef intargfunc ssizeargfunc;
 typedef intintargfunc ssizessizeargfunc;
@@ -902,8 +913,7 @@ typedef getreadbufferproc readbufferproc;
 typedef getwritebufferproc writebufferproc;
 typedef getsegcountproc segcountproc;
 typedef getcharbufferproc charbufferproc;
-static long PyNumber_AsSsize_t (PyObject *x, void *SWIGUNUSEDPARM(exc))
-{
+static long PyNumber_AsSsize_t(PyObject *x, void *SWIGUNUSEDPARM(exc)) {
   long result = 0;
   PyObject *i = PyNumber_Int(x);
   if (i) {
@@ -919,13 +929,13 @@ static long PyNumber_AsSsize_t (PyObject *x, void *SWIGUNUSEDPARM(exc))
 #endif
 
 #if PY_VERSION_HEX < 0x02040000
-#define Py_VISIT(op)				\
-  do { 						\
-    if (op) {					\
-      int vret = visit((op), arg);		\
-      if (vret)					\
-        return vret;				\
-    }						\
+#define Py_VISIT(op)                                                           \
+  do {                                                                         \
+    if (op) {                                                                  \
+      int vret = visit((op), arg);                                             \
+      if (vret)                                                                \
+        return vret;                                                           \
+    }                                                                          \
   } while (0)
 #endif
 
@@ -944,11 +954,13 @@ typedef struct {
 typedef destructor freefunc;
 #endif
 
-#if ((PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 6) || \
-     (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 0) || \
+#if ((PY_MAJOR_VERSION == 2 && PY_MINOR_VERSION > 6) ||                        \
+     (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION > 0) ||                        \
      (PY_MAJOR_VERSION > 3))
-# define SWIGPY_USE_CAPSULE
-# define SWIGPY_CAPSULE_NAME ((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION ".type_pointer_capsule" SWIG_TYPE_TABLE_NAME)
+#define SWIGPY_USE_CAPSULE
+#define SWIGPY_CAPSULE_NAME                                                    \
+  ((char *)"swig_runtime_data" SWIG_RUNTIME_VERSION                            \
+           ".type_pointer_capsule" SWIG_TYPE_TABLE_NAME)
 #endif
 
 #if PY_VERSION_HEX < 0x03020000
@@ -958,12 +970,12 @@ typedef destructor freefunc;
 
 /* -----------------------------------------------------------------------------
  * error manipulation
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 
-SWIGRUNTIME PyObject*
-SWIG_Python_ErrorType(int code) {
-  PyObject* type = 0;
-  switch(code) {
+SWIGRUNTIME PyObject *SWIG_Python_ErrorType(int code) {
+  PyObject *type = 0;
+  switch (code) {
   case SWIG_MemoryError:
     type = PyExc_MemoryError;
     break;
@@ -1003,15 +1015,13 @@ SWIG_Python_ErrorType(int code) {
   return type;
 }
 
-
-SWIGRUNTIME void
-SWIG_Python_AddErrorMsg(const char* mesg)
-{
+SWIGRUNTIME void SWIG_Python_AddErrorMsg(const char *mesg) {
   PyObject *type = 0;
   PyObject *value = 0;
   PyObject *traceback = 0;
 
-  if (PyErr_Occurred()) PyErr_Fetch(&type, &value, &traceback);
+  if (PyErr_Occurred())
+    PyErr_Fetch(&type, &value, &traceback);
   if (value) {
     char *tmp;
     PyObject *old_str = PyObject_Str(value);
@@ -1028,75 +1038,93 @@ SWIG_Python_AddErrorMsg(const char* mesg)
 }
 
 #if defined(SWIG_PYTHON_NO_THREADS)
-#  if defined(SWIG_PYTHON_THREADS)
-#    undef SWIG_PYTHON_THREADS
-#  endif
+#if defined(SWIG_PYTHON_THREADS)
+#undef SWIG_PYTHON_THREADS
+#endif
 #endif
 #if defined(SWIG_PYTHON_THREADS) /* Threading support is enabled */
-#  if !defined(SWIG_PYTHON_USE_GIL) && !defined(SWIG_PYTHON_NO_USE_GIL)
-#    if (PY_VERSION_HEX >= 0x02030000) /* For 2.3 or later, use the PyGILState calls */
-#      define SWIG_PYTHON_USE_GIL
-#    endif
-#  endif
-#  if defined(SWIG_PYTHON_USE_GIL) /* Use PyGILState threads calls */
-#    ifndef SWIG_PYTHON_INITIALIZE_THREADS
-#     define SWIG_PYTHON_INITIALIZE_THREADS  PyEval_InitThreads() 
-#    endif
-#    ifdef __cplusplus /* C++ code */
-       class SWIG_Python_Thread_Block {
-         bool status;
-         PyGILState_STATE state;
-       public:
-         void end() { if (status) { PyGILState_Release(state); status = false;} }
-         SWIG_Python_Thread_Block() : status(true), state(PyGILState_Ensure()) {}
-         ~SWIG_Python_Thread_Block() { end(); }
-       };
-       class SWIG_Python_Thread_Allow {
-         bool status;
-         PyThreadState *save;
-       public:
-         void end() { if (status) { PyEval_RestoreThread(save); status = false; }}
-         SWIG_Python_Thread_Allow() : status(true), save(PyEval_SaveThread()) {}
-         ~SWIG_Python_Thread_Allow() { end(); }
-       };
-#      define SWIG_PYTHON_THREAD_BEGIN_BLOCK   SWIG_Python_Thread_Block _swig_thread_block
-#      define SWIG_PYTHON_THREAD_END_BLOCK     _swig_thread_block.end()
-#      define SWIG_PYTHON_THREAD_BEGIN_ALLOW   SWIG_Python_Thread_Allow _swig_thread_allow
-#      define SWIG_PYTHON_THREAD_END_ALLOW     _swig_thread_allow.end()
-#    else /* C code */
-#      define SWIG_PYTHON_THREAD_BEGIN_BLOCK   PyGILState_STATE _swig_thread_block = PyGILState_Ensure()
-#      define SWIG_PYTHON_THREAD_END_BLOCK     PyGILState_Release(_swig_thread_block)
-#      define SWIG_PYTHON_THREAD_BEGIN_ALLOW   PyThreadState *_swig_thread_allow = PyEval_SaveThread()
-#      define SWIG_PYTHON_THREAD_END_ALLOW     PyEval_RestoreThread(_swig_thread_allow)
-#    endif
-#  else /* Old thread way, not implemented, user must provide it */
-#    if !defined(SWIG_PYTHON_INITIALIZE_THREADS)
-#      define SWIG_PYTHON_INITIALIZE_THREADS
-#    endif
-#    if !defined(SWIG_PYTHON_THREAD_BEGIN_BLOCK)
-#      define SWIG_PYTHON_THREAD_BEGIN_BLOCK
-#    endif
-#    if !defined(SWIG_PYTHON_THREAD_END_BLOCK)
-#      define SWIG_PYTHON_THREAD_END_BLOCK
-#    endif
-#    if !defined(SWIG_PYTHON_THREAD_BEGIN_ALLOW)
-#      define SWIG_PYTHON_THREAD_BEGIN_ALLOW
-#    endif
-#    if !defined(SWIG_PYTHON_THREAD_END_ALLOW)
-#      define SWIG_PYTHON_THREAD_END_ALLOW
-#    endif
-#  endif
+#if !defined(SWIG_PYTHON_USE_GIL) && !defined(SWIG_PYTHON_NO_USE_GIL)
+#if (PY_VERSION_HEX >=                                                         \
+     0x02030000) /* For 2.3 or later, use the PyGILState calls */
+#define SWIG_PYTHON_USE_GIL
+#endif
+#endif
+#if defined(SWIG_PYTHON_USE_GIL) /* Use PyGILState threads calls */
+#ifndef SWIG_PYTHON_INITIALIZE_THREADS
+#define SWIG_PYTHON_INITIALIZE_THREADS PyEval_InitThreads()
+#endif
+#ifdef __cplusplus /* C++ code */
+class SWIG_Python_Thread_Block {
+  bool status;
+  PyGILState_STATE state;
+
+public:
+  void end() {
+    if (status) {
+      PyGILState_Release(state);
+      status = false;
+    }
+  }
+  SWIG_Python_Thread_Block() : status(true), state(PyGILState_Ensure()) {}
+  ~SWIG_Python_Thread_Block() { end(); }
+};
+class SWIG_Python_Thread_Allow {
+  bool status;
+  PyThreadState *save;
+
+public:
+  void end() {
+    if (status) {
+      PyEval_RestoreThread(save);
+      status = false;
+    }
+  }
+  SWIG_Python_Thread_Allow() : status(true), save(PyEval_SaveThread()) {}
+  ~SWIG_Python_Thread_Allow() { end(); }
+};
+#define SWIG_PYTHON_THREAD_BEGIN_BLOCK                                         \
+  SWIG_Python_Thread_Block _swig_thread_block
+#define SWIG_PYTHON_THREAD_END_BLOCK _swig_thread_block.end()
+#define SWIG_PYTHON_THREAD_BEGIN_ALLOW                                         \
+  SWIG_Python_Thread_Allow _swig_thread_allow
+#define SWIG_PYTHON_THREAD_END_ALLOW _swig_thread_allow.end()
+#else /* C code */
+#define SWIG_PYTHON_THREAD_BEGIN_BLOCK                                         \
+  PyGILState_STATE _swig_thread_block = PyGILState_Ensure()
+#define SWIG_PYTHON_THREAD_END_BLOCK PyGILState_Release(_swig_thread_block)
+#define SWIG_PYTHON_THREAD_BEGIN_ALLOW                                         \
+  PyThreadState *_swig_thread_allow = PyEval_SaveThread()
+#define SWIG_PYTHON_THREAD_END_ALLOW PyEval_RestoreThread(_swig_thread_allow)
+#endif
+#else /* Old thread way, not implemented, user must provide it */
+#if !defined(SWIG_PYTHON_INITIALIZE_THREADS)
+#define SWIG_PYTHON_INITIALIZE_THREADS
+#endif
+#if !defined(SWIG_PYTHON_THREAD_BEGIN_BLOCK)
+#define SWIG_PYTHON_THREAD_BEGIN_BLOCK
+#endif
+#if !defined(SWIG_PYTHON_THREAD_END_BLOCK)
+#define SWIG_PYTHON_THREAD_END_BLOCK
+#endif
+#if !defined(SWIG_PYTHON_THREAD_BEGIN_ALLOW)
+#define SWIG_PYTHON_THREAD_BEGIN_ALLOW
+#endif
+#if !defined(SWIG_PYTHON_THREAD_END_ALLOW)
+#define SWIG_PYTHON_THREAD_END_ALLOW
+#endif
+#endif
 #else /* No thread support */
-#  define SWIG_PYTHON_INITIALIZE_THREADS
-#  define SWIG_PYTHON_THREAD_BEGIN_BLOCK
-#  define SWIG_PYTHON_THREAD_END_BLOCK
-#  define SWIG_PYTHON_THREAD_BEGIN_ALLOW
-#  define SWIG_PYTHON_THREAD_END_ALLOW
+#define SWIG_PYTHON_INITIALIZE_THREADS
+#define SWIG_PYTHON_THREAD_BEGIN_BLOCK
+#define SWIG_PYTHON_THREAD_END_BLOCK
+#define SWIG_PYTHON_THREAD_BEGIN_ALLOW
+#define SWIG_PYTHON_THREAD_END_ALLOW
 #endif
 
 /* -----------------------------------------------------------------------------
  * Python API portion that goes into the runtime
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 
 #ifdef __cplusplus
 extern "C" {
@@ -1104,11 +1132,12 @@ extern "C" {
 
 /* -----------------------------------------------------------------------------
  * Constant declarations
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 
 /* Constant Types */
 #define SWIG_PY_POINTER 4
-#define SWIG_PY_BINARY  5
+#define SWIG_PY_BINARY 5
 
 /* Constant information structure */
 typedef struct swig_const_info {
@@ -1116,23 +1145,24 @@ typedef struct swig_const_info {
   char *name;
   long lvalue;
   double dvalue;
-  void   *pvalue;
+  void *pvalue;
   swig_type_info **ptype;
 } swig_const_info;
 
-
 /* -----------------------------------------------------------------------------
  * Wrapper of PyInstanceMethod_New() used in Python 3
  * It is exported to the generated module, used for -fastproxy
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 #if PY_VERSION_HEX >= 0x03000000
-SWIGRUNTIME PyObject* SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self), PyObject *func)
-{
+SWIGRUNTIME PyObject *SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self),
+                                                PyObject *func) {
   return PyInstanceMethod_New(func);
 }
 #else
-SWIGRUNTIME PyObject* SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self), PyObject *SWIGUNUSEDPARM(func))
-{
+SWIGRUNTIME PyObject *
+SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self),
+                          PyObject *SWIGUNUSEDPARM(func)) {
   return NULL;
 }
 #endif
@@ -1141,7 +1171,6 @@ SWIGRUNTIME PyObject* SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self),
 }
 #endif
 
-
 /* -----------------------------------------------------------------------------
  * pyrun.swg
  *
@@ -1149,91 +1178,100 @@ SWIGRUNTIME PyObject* SWIG_PyInstanceMethod_New(PyObject *SWIGUNUSEDPARM(self),
  * and includes code for managing global variables and pointer
  * type checking.
  *
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 
 /* Common SWIG API */
 
 /* for raw pointers */
-#define SWIG_Python_ConvertPtr(obj, pptr, type, flags)  SWIG_Python_ConvertPtrAndOwn(obj, pptr, type, flags, 0)
-#define SWIG_ConvertPtr(obj, pptr, type, flags)         SWIG_Python_ConvertPtr(obj, pptr, type, flags)
-#define SWIG_ConvertPtrAndOwn(obj,pptr,type,flags,own)  SWIG_Python_ConvertPtrAndOwn(obj, pptr, type, flags, own)
+#define SWIG_Python_ConvertPtr(obj, pptr, type, flags)                         \
+  SWIG_Python_ConvertPtrAndOwn(obj, pptr, type, flags, 0)
+#define SWIG_ConvertPtr(obj, pptr, type, flags)                                \
+  SWIG_Python_ConvertPtr(obj, pptr, type, flags)
+#define SWIG_ConvertPtrAndOwn(obj, pptr, type, flags, own)                     \
+  SWIG_Python_ConvertPtrAndOwn(obj, pptr, type, flags, own)
 
 #ifdef SWIGPYTHON_BUILTIN
-#define SWIG_NewPointerObj(ptr, type, flags)            SWIG_Python_NewPointerObj(self, ptr, type, flags)
+#define SWIG_NewPointerObj(ptr, type, flags)                                   \
+  SWIG_Python_NewPointerObj(self, ptr, type, flags)
 #else
-#define SWIG_NewPointerObj(ptr, type, flags)            SWIG_Python_NewPointerObj(NULL, ptr, type, flags)
+#define SWIG_NewPointerObj(ptr, type, flags)                                   \
+  SWIG_Python_NewPointerObj(NULL, ptr, type, flags)
 #endif
 
-#define SWIG_InternalNewPointerObj(ptr, type, flags)	SWIG_Python_NewPointerObj(NULL, ptr, type, flags)
+#define SWIG_InternalNewPointerObj(ptr, type, flags)                           \
+  SWIG_Python_NewPointerObj(NULL, ptr, type, flags)
 
-#define SWIG_CheckImplicit(ty)                          SWIG_Python_CheckImplicit(ty) 
-#define SWIG_AcquirePtr(ptr, src)                       SWIG_Python_AcquirePtr(ptr, src)
-#define swig_owntype                                    int
+#define SWIG_CheckImplicit(ty) SWIG_Python_CheckImplicit(ty)
+#define SWIG_AcquirePtr(ptr, src) SWIG_Python_AcquirePtr(ptr, src)
+#define swig_owntype int
 
 /* for raw packed data */
-#define SWIG_ConvertPacked(obj, ptr, sz, ty)            SWIG_Python_ConvertPacked(obj, ptr, sz, ty)
-#define SWIG_NewPackedObj(ptr, sz, type)                SWIG_Python_NewPackedObj(ptr, sz, type)
+#define SWIG_ConvertPacked(obj, ptr, sz, ty)                                   \
+  SWIG_Python_ConvertPacked(obj, ptr, sz, ty)
+#define SWIG_NewPackedObj(ptr, sz, type) SWIG_Python_NewPackedObj(ptr, sz, type)
 
 /* for class or struct pointers */
-#define SWIG_ConvertInstance(obj, pptr, type, flags)    SWIG_ConvertPtr(obj, pptr, type, flags)
-#define SWIG_NewInstanceObj(ptr, type, flags)           SWIG_NewPointerObj(ptr, type, flags)
+#define SWIG_ConvertInstance(obj, pptr, type, flags)                           \
+  SWIG_ConvertPtr(obj, pptr, type, flags)
+#define SWIG_NewInstanceObj(ptr, type, flags)                                  \
+  SWIG_NewPointerObj(ptr, type, flags)
 
 /* for C or C++ function pointers */
-#define SWIG_ConvertFunctionPtr(obj, pptr, type)        SWIG_Python_ConvertFunctionPtr(obj, pptr, type)
-#define SWIG_NewFunctionPtrObj(ptr, type)               SWIG_Python_NewPointerObj(NULL, ptr, type, 0)
+#define SWIG_ConvertFunctionPtr(obj, pptr, type)                               \
+  SWIG_Python_ConvertFunctionPtr(obj, pptr, type)
+#define SWIG_NewFunctionPtrObj(ptr, type)                                      \
+  SWIG_Python_NewPointerObj(NULL, ptr, type, 0)
 
 /* for C++ member pointers, ie, member methods */
-#define SWIG_ConvertMember(obj, ptr, sz, ty)            SWIG_Python_ConvertPacked(obj, ptr, sz, ty)
-#define SWIG_NewMemberObj(ptr, sz, type)                SWIG_Python_NewPackedObj(ptr, sz, type)
-
+#define SWIG_ConvertMember(obj, ptr, sz, ty)                                   \
+  SWIG_Python_ConvertPacked(obj, ptr, sz, ty)
+#define SWIG_NewMemberObj(ptr, sz, type) SWIG_Python_NewPackedObj(ptr, sz, type)
 
 /* Runtime API */
 
-#define SWIG_GetModule(clientdata)                      SWIG_Python_GetModule(clientdata)
-#define SWIG_SetModule(clientdata, pointer)             SWIG_Python_SetModule(pointer)
-#define SWIG_NewClientData(obj)                         SwigPyClientData_New(obj)
-
-#define SWIG_SetErrorObj                                SWIG_Python_SetErrorObj                            
-#define SWIG_SetErrorMsg                        	SWIG_Python_SetErrorMsg				   
-#define SWIG_ErrorType(code)                    	SWIG_Python_ErrorType(code)                        
-#define SWIG_Error(code, msg)            		SWIG_Python_SetErrorMsg(SWIG_ErrorType(code), msg) 
-#define SWIG_fail                        		goto fail					   
+#define SWIG_GetModule(clientdata) SWIG_Python_GetModule(clientdata)
+#define SWIG_SetModule(clientdata, pointer) SWIG_Python_SetModule(pointer)
+#define SWIG_NewClientData(obj) SwigPyClientData_New(obj)
 
+#define SWIG_SetErrorObj SWIG_Python_SetErrorObj
+#define SWIG_SetErrorMsg SWIG_Python_SetErrorMsg
+#define SWIG_ErrorType(code) SWIG_Python_ErrorType(code)
+#define SWIG_Error(code, msg) SWIG_Python_SetErrorMsg(SWIG_ErrorType(code), msg)
+#define SWIG_fail goto fail
 
 /* Runtime API implementation */
 
 /* Error manipulation */
 
-SWIGINTERN void 
-SWIG_Python_SetErrorObj(PyObject *errtype, PyObject *obj) {
-  SWIG_PYTHON_THREAD_BEGIN_BLOCK; 
+SWIGINTERN void SWIG_Python_SetErrorObj(PyObject *errtype, PyObject *obj) {
+  SWIG_PYTHON_THREAD_BEGIN_BLOCK;
   PyErr_SetObject(errtype, obj);
   Py_DECREF(obj);
   SWIG_PYTHON_THREAD_END_BLOCK;
 }
 
-SWIGINTERN void 
-SWIG_Python_SetErrorMsg(PyObject *errtype, const char *msg) {
+SWIGINTERN void SWIG_Python_SetErrorMsg(PyObject *errtype, const char *msg) {
   SWIG_PYTHON_THREAD_BEGIN_BLOCK;
   PyErr_SetString(errtype, msg);
   SWIG_PYTHON_THREAD_END_BLOCK;
 }
 
-#define SWIG_Python_Raise(obj, type, desc)  SWIG_Python_SetErrorObj(SWIG_Python_ExceptionType(desc), obj)
+#define SWIG_Python_Raise(obj, type, desc)                                     \
+  SWIG_Python_SetErrorObj(SWIG_Python_ExceptionType(desc), obj)
 
 /* Set a constant value */
 
 #if defined(SWIGPYTHON_BUILTIN)
 
-SWIGINTERN void
-SwigPyBuiltin_AddPublicSymbol(PyObject *seq, const char *key) {
+SWIGINTERN void SwigPyBuiltin_AddPublicSymbol(PyObject *seq, const char *key) {
   PyObject *s = PyString_InternFromString(key);
   PyList_Append(seq, s);
   Py_DECREF(s);
 }
 
-SWIGINTERN void
-SWIG_Python_SetConstant(PyObject *d, PyObject *public_interface, const char *name, PyObject *obj) {   
+SWIGINTERN void SWIG_Python_SetConstant(PyObject *d, PyObject *public_interface,
+                                        const char *name, PyObject *obj) {
 #if PY_VERSION_HEX < 0x02030000
   PyDict_SetItemString(d, (char *)name, obj);
 #else
@@ -1246,22 +1284,21 @@ SWIG_Python_SetConstant(PyObject *d, PyObject *public_interface, const char *nam
 
 #else
 
-SWIGINTERN void
-SWIG_Python_SetConstant(PyObject *d, const char *name, PyObject *obj) {   
+SWIGINTERN void SWIG_Python_SetConstant(PyObject *d, const char *name,
+                                        PyObject *obj) {
 #if PY_VERSION_HEX < 0x02030000
   PyDict_SetItemString(d, (char *)name, obj);
 #else
   PyDict_SetItemString(d, name, obj);
 #endif
-  Py_DECREF(obj);                            
+  Py_DECREF(obj);
 }
 
 #endif
 
 /* Append a value to the result obj */
 
-SWIGINTERN PyObject*
-SWIG_Python_AppendOutput(PyObject* result, PyObject* obj) {
+SWIGINTERN PyObject *SWIG_Python_AppendOutput(PyObject *result, PyObject *obj) {
 #if !defined(SWIG_PYTHON_OUTPUT_TUPLE)
   if (!result) {
     result = obj;
@@ -1274,13 +1311,13 @@ SWIG_Python_AppendOutput(PyObject* result, PyObject* obj) {
       result = PyList_New(1);
       PyList_SetItem(result, 0, o2);
     }
-    PyList_Append(result,obj);
+    PyList_Append(result, obj);
     Py_DECREF(obj);
   }
   return result;
 #else
-  PyObject*   o2;
-  PyObject*   o3;
+  PyObject *o2;
+  PyObject *o3;
   if (!result) {
     result = obj;
   } else if (result == Py_None) {
@@ -1305,57 +1342,60 @@ SWIG_Python_AppendOutput(PyObject* result, PyObject* obj) {
 
 /* Unpack the argument tuple */
 
-SWIGINTERN Py_ssize_t
-SWIG_Python_UnpackTuple(PyObject *args, const char *name, Py_ssize_t min, Py_ssize_t max, PyObject **objs)
-{
+SWIGINTERN Py_ssize_t SWIG_Python_UnpackTuple(PyObject *args, const char *name,
+                                              Py_ssize_t min, Py_ssize_t max,
+                                              PyObject **objs) {
   if (!args) {
     if (!min && !max) {
       return 1;
     } else {
-      PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got none", 
-		   name, (min == max ? "" : "at least "), (int)min);
+      PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got none",
+                   name, (min == max ? "" : "at least "), (int)min);
       return 0;
     }
-  }  
+  }
   if (!PyTuple_Check(args)) {
     if (min <= 1 && max >= 1) {
       Py_ssize_t i;
       objs[0] = args;
       for (i = 1; i < max; ++i) {
-	objs[i] = 0;
+        objs[i] = 0;
       }
       return 2;
     }
-    PyErr_SetString(PyExc_SystemError, "UnpackTuple() argument list is not a tuple");
+    PyErr_SetString(PyExc_SystemError,
+                    "UnpackTuple() argument list is not a tuple");
     return 0;
   } else {
     Py_ssize_t l = PyTuple_GET_SIZE(args);
     if (l < min) {
-      PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got %d", 
-		   name, (min == max ? "" : "at least "), (int)min, (int)l);
+      PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got %d", name,
+                   (min == max ? "" : "at least "), (int)min, (int)l);
       return 0;
     } else if (l > max) {
-      PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got %d", 
-		   name, (min == max ? "" : "at most "), (int)max, (int)l);
+      PyErr_Format(PyExc_TypeError, "%s expected %s%d arguments, got %d", name,
+                   (min == max ? "" : "at most "), (int)max, (int)l);
       return 0;
     } else {
       Py_ssize_t i;
       for (i = 0; i < l; ++i) {
-	objs[i] = PyTuple_GET_ITEM(args, i);
+        objs[i] = PyTuple_GET_ITEM(args, i);
       }
       for (; l < max; ++l) {
-	objs[l] = 0;
+        objs[l] = 0;
       }
       return i + 1;
-    }    
+    }
   }
 }
 
 /* A functor is a function object with one single object argument */
 #if PY_VERSION_HEX >= 0x02020000
-#define SWIG_Python_CallFunctor(functor, obj)	        PyObject_CallFunctionObjArgs(functor, obj, NULL);
+#define SWIG_Python_CallFunctor(functor, obj)                                  \
+  PyObject_CallFunctionObjArgs(functor, obj, NULL);
 #else
-#define SWIG_Python_CallFunctor(functor, obj)	        PyObject_CallFunction(functor, "O", obj);
+#define SWIG_Python_CallFunctor(functor, obj)                                  \
+  PyObject_CallFunction(functor, "O", obj);
 #endif
 
 /*
@@ -1363,23 +1403,27 @@ SWIG_Python_UnpackTuple(PyObject *args, const char *name, Py_ssize_t min, Py_ssi
   static PyObject *SWIG_STATIC_POINTER(MyVar) = NewSomething(...);
 */
 #ifdef __cplusplus
-#define SWIG_STATIC_POINTER(var)  var
+#define SWIG_STATIC_POINTER(var) var
 #else
-#define SWIG_STATIC_POINTER(var)  var = 0; if (!var) var
+#define SWIG_STATIC_POINTER(var)                                               \
+  var = 0;                                                                     \
+  if (!var)                                                                    \
+  var
 #endif
 
 /* -----------------------------------------------------------------------------
  * Pointer declarations
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 
 /* Flags for new pointer objects */
-#define SWIG_POINTER_NOSHADOW       (SWIG_POINTER_OWN      << 1)
-#define SWIG_POINTER_NEW            (SWIG_POINTER_NOSHADOW | SWIG_POINTER_OWN)
+#define SWIG_POINTER_NOSHADOW (SWIG_POINTER_OWN << 1)
+#define SWIG_POINTER_NEW (SWIG_POINTER_NOSHADOW | SWIG_POINTER_OWN)
 
-#define SWIG_POINTER_IMPLICIT_CONV  (SWIG_POINTER_DISOWN   << 1)
+#define SWIG_POINTER_IMPLICIT_CONV (SWIG_POINTER_DISOWN << 1)
 
-#define SWIG_BUILTIN_TP_INIT	    (SWIG_POINTER_OWN << 2)
-#define SWIG_BUILTIN_INIT	    (SWIG_BUILTIN_TP_INIT | SWIG_POINTER_OWN)
+#define SWIG_BUILTIN_TP_INIT (SWIG_POINTER_OWN << 2)
+#define SWIG_BUILTIN_INIT (SWIG_BUILTIN_TP_INIT | SWIG_POINTER_OWN)
 
 #ifdef __cplusplus
 extern "C" {
@@ -1387,28 +1431,24 @@ extern "C" {
 
 /*  How to access Py_None */
 #if defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__)
-#  ifndef SWIG_PYTHON_NO_BUILD_NONE
-#    ifndef SWIG_PYTHON_BUILD_NONE
-#      define SWIG_PYTHON_BUILD_NONE
-#    endif
-#  endif
+#ifndef SWIG_PYTHON_NO_BUILD_NONE
+#ifndef SWIG_PYTHON_BUILD_NONE
+#define SWIG_PYTHON_BUILD_NONE
+#endif
+#endif
 #endif
 
 #ifdef SWIG_PYTHON_BUILD_NONE
-#  ifdef Py_None
-#   undef Py_None
-#   define Py_None SWIG_Py_None()
-#  endif
-SWIGRUNTIMEINLINE PyObject * 
-_SWIG_Py_None(void)
-{
-  PyObject *none = Py_BuildValue((char*)"");
+#ifdef Py_None
+#undef Py_None
+#define Py_None SWIG_Py_None()
+#endif
+SWIGRUNTIMEINLINE PyObject *_SWIG_Py_None(void) {
+  PyObject *none = Py_BuildValue((char *)"");
   Py_DECREF(none);
   return none;
 }
-SWIGRUNTIME PyObject * 
-SWIG_Py_None(void)
-{
+SWIGRUNTIME PyObject *SWIG_Py_None(void) {
   static PyObject *SWIG_STATIC_POINTER(none) = _SWIG_Py_None();
   return none;
 }
@@ -1416,9 +1456,7 @@ SWIG_Py_None(void)
 
 /* The python void return value */
 
-SWIGRUNTIMEINLINE PyObject * 
-SWIG_Py_Void(void)
-{
+SWIGRUNTIMEINLINE PyObject *SWIG_Py_Void(void) {
   PyObject *none = Py_None;
   Py_INCREF(none);
   return none;
@@ -1436,32 +1474,28 @@ typedef struct {
   PyTypeObject *pytype;
 } SwigPyClientData;
 
-SWIGRUNTIMEINLINE int 
-SWIG_Python_CheckImplicit(swig_type_info *ty)
-{
+SWIGRUNTIMEINLINE int SWIG_Python_CheckImplicit(swig_type_info *ty) {
   SwigPyClientData *data = (SwigPyClientData *)ty->clientdata;
   return data ? data->implicitconv : 0;
 }
 
-SWIGRUNTIMEINLINE PyObject *
-SWIG_Python_ExceptionType(swig_type_info *desc) {
-  SwigPyClientData *data = desc ? (SwigPyClientData *) desc->clientdata : 0;
+SWIGRUNTIMEINLINE PyObject *SWIG_Python_ExceptionType(swig_type_info *desc) {
+  SwigPyClientData *data = desc ? (SwigPyClientData *)desc->clientdata : 0;
   PyObject *klass = data ? data->klass : 0;
   return (klass ? klass : PyExc_RuntimeError);
 }
 
-
-SWIGRUNTIME SwigPyClientData * 
-SwigPyClientData_New(PyObject* obj)
-{
+SWIGRUNTIME SwigPyClientData *SwigPyClientData_New(PyObject *obj) {
   if (!obj) {
     return 0;
   } else {
-    SwigPyClientData *data = (SwigPyClientData *)malloc(sizeof(SwigPyClientData));
+    SwigPyClientData *data =
+        (SwigPyClientData *)malloc(sizeof(SwigPyClientData));
     /* the klass element */
     data->klass = obj;
     Py_INCREF(data->klass);
-    /* the newraw method and newargs arguments used to create a new raw instance */
+    /* the newraw method and newargs arguments used to create a new raw instance
+     */
     if (PyClass_Check(obj)) {
       data->newraw = 0;
       data->newargs = obj;
@@ -1473,16 +1507,17 @@ SwigPyClientData_New(PyObject* obj)
       data->newraw = PyObject_GetAttrString(data->klass, (char *)"__new__");
 #endif
       if (data->newraw) {
-	Py_INCREF(data->newraw);
-	data->newargs = PyTuple_New(1);
-	PyTuple_SetItem(data->newargs, 0, obj);
+        Py_INCREF(data->newraw);
+        data->newargs = PyTuple_New(1);
+        PyTuple_SetItem(data->newargs, 0, obj);
       } else {
-	data->newargs = obj;
+        data->newargs = obj;
       }
       Py_INCREF(data->newargs);
     }
     /* the destroy method, aka as the C++ delete method */
-    data->destroy = PyObject_GetAttrString(data->klass, (char *)"__swig_destroy__");
+    data->destroy =
+        PyObject_GetAttrString(data->klass, (char *)"__swig_destroy__");
     if (PyErr_Occurred()) {
       PyErr_Clear();
       data->destroy = 0;
@@ -1505,8 +1540,7 @@ SwigPyClientData_New(PyObject* obj)
   }
 }
 
-SWIGRUNTIME void 
-SwigPyClientData_Del(SwigPyClientData *data) {
+SWIGRUNTIME void SwigPyClientData_Del(SwigPyClientData *data) {
   Py_XDECREF(data->newraw);
   Py_XDECREF(data->newargs);
   Py_XDECREF(data->destroy);
@@ -1515,8 +1549,7 @@ SwigPyClientData_Del(SwigPyClientData *data) {
 /* =============== SwigPyObject =====================*/
 
 typedef struct {
-  PyObject_HEAD
-  void *ptr;
+  PyObject_HEAD void *ptr;
   swig_type_info *ty;
   int own;
   PyObject *next;
@@ -1525,12 +1558,10 @@ typedef struct {
 #endif
 } SwigPyObject;
 
-
 #ifdef SWIGPYTHON_BUILTIN
 
 SWIGRUNTIME PyObject *
-SwigPyObject_get___dict__(PyObject *v, PyObject *SWIGUNUSEDPARM(args))
-{
+SwigPyObject_get___dict__(PyObject *v, PyObject *SWIGUNUSEDPARM(args)) {
   SwigPyObject *sobj = (SwigPyObject *)v;
 
   if (!sobj->dict)
@@ -1542,15 +1573,11 @@ SwigPyObject_get___dict__(PyObject *v, PyObject *SWIGUNUSEDPARM(args))
 
 #endif
 
-SWIGRUNTIME PyObject *
-SwigPyObject_long(SwigPyObject *v)
-{
+SWIGRUNTIME PyObject *SwigPyObject_long(SwigPyObject *v) {
   return PyLong_FromVoidPtr(v->ptr);
 }
 
-SWIGRUNTIME PyObject *
-SwigPyObject_format(const char* fmt, SwigPyObject *v)
-{
+SWIGRUNTIME PyObject *SwigPyObject_format(const char *fmt, SwigPyObject *v) {
   PyObject *res = NULL;
   PyObject *args = PyTuple_New(1);
   if (args) {
@@ -1558,11 +1585,11 @@ SwigPyObject_format(const char* fmt, SwigPyObject *v)
       PyObject *ofmt = SWIG_Python_str_FromChar(fmt);
       if (ofmt) {
 #if PY_VERSION_HEX >= 0x03000000
-	res = PyUnicode_Format(ofmt,args);
+        res = PyUnicode_Format(ofmt, args);
 #else
-	res = PyString_Format(ofmt,args);
+        res = PyString_Format(ofmt, args);
 #endif
-	Py_DECREF(ofmt);
+        Py_DECREF(ofmt);
       }
       Py_DECREF(args);
     }
@@ -1570,16 +1597,12 @@ SwigPyObject_format(const char* fmt, SwigPyObject *v)
   return res;
 }
 
-SWIGRUNTIME PyObject *
-SwigPyObject_oct(SwigPyObject *v)
-{
-  return SwigPyObject_format("%o",v);
+SWIGRUNTIME PyObject *SwigPyObject_oct(SwigPyObject *v) {
+  return SwigPyObject_format("%o", v);
 }
 
-SWIGRUNTIME PyObject *
-SwigPyObject_hex(SwigPyObject *v)
-{
-  return SwigPyObject_format("%x",v);
+SWIGRUNTIME PyObject *SwigPyObject_hex(SwigPyObject *v) {
+  return SwigPyObject_format("%x", v);
 }
 
 SWIGRUNTIME PyObject *
@@ -1590,92 +1613,84 @@ SwigPyObject_repr(SwigPyObject *v, PyObject *args)
 #endif
 {
   const char *name = SWIG_TypePrettyName(v->ty);
-  PyObject *repr = SWIG_Python_str_FromFormat("<Swig Object of type '%s' at %p>", (name ? name : "unknown"), (void *)v);
+  PyObject *repr = SWIG_Python_str_FromFormat(
+      "<Swig Object of type '%s' at %p>", (name ? name : "unknown"), (void *)v);
   if (v->next) {
-# ifdef METH_NOARGS
+#ifdef METH_NOARGS
     PyObject *nrep = SwigPyObject_repr((SwigPyObject *)v->next);
-# else
+#else
     PyObject *nrep = SwigPyObject_repr((SwigPyObject *)v->next, args);
-# endif
-# if PY_VERSION_HEX >= 0x03000000
+#endif
+#if PY_VERSION_HEX >= 0x03000000
     PyObject *joined = PyUnicode_Concat(repr, nrep);
     Py_DecRef(repr);
     Py_DecRef(nrep);
     repr = joined;
-# else
-    PyString_ConcatAndDel(&repr,nrep);
-# endif
+#else
+    PyString_ConcatAndDel(&repr, nrep);
+#endif
   }
-  return repr;  
+  return repr;
 }
 
-SWIGRUNTIME int
-SwigPyObject_compare(SwigPyObject *v, SwigPyObject *w)
-{
+SWIGRUNTIME int SwigPyObject_compare(SwigPyObject *v, SwigPyObject *w) {
   void *i = v->ptr;
   void *j = w->ptr;
   return (i < j) ? -1 : ((i > j) ? 1 : 0);
 }
 
 /* Added for Python 3.x, would it also be useful for Python 2.x? */
-SWIGRUNTIME PyObject*
-SwigPyObject_richcompare(SwigPyObject *v, SwigPyObject *w, int op)
-{
-  PyObject* res;
-  if( op != Py_EQ && op != Py_NE ) {
+SWIGRUNTIME PyObject *SwigPyObject_richcompare(SwigPyObject *v, SwigPyObject *w,
+                                               int op) {
+  PyObject *res;
+  if (op != Py_EQ && op != Py_NE) {
     Py_INCREF(Py_NotImplemented);
     return Py_NotImplemented;
   }
-  res = PyBool_FromLong( (SwigPyObject_compare(v, w)==0) == (op == Py_EQ) ? 1 : 0);
-  return res;  
+  res = PyBool_FromLong((SwigPyObject_compare(v, w) == 0) == (op == Py_EQ) ? 1
+                                                                           : 0);
+  return res;
 }
 
-
-SWIGRUNTIME PyTypeObject* SwigPyObject_TypeOnce(void);
+SWIGRUNTIME PyTypeObject *SwigPyObject_TypeOnce(void);
 
 #ifdef SWIGPYTHON_BUILTIN
 static swig_type_info *SwigPyObject_stype = 0;
-SWIGRUNTIME PyTypeObject*
-SwigPyObject_type(void) {
-    SwigPyClientData *cd;
-    assert(SwigPyObject_stype);
-    cd = (SwigPyClientData*) SwigPyObject_stype->clientdata;
-    assert(cd);
-    assert(cd->pytype);
-    return cd->pytype;
+SWIGRUNTIME PyTypeObject *SwigPyObject_type(void) {
+  SwigPyClientData *cd;
+  assert(SwigPyObject_stype);
+  cd = (SwigPyClientData *)SwigPyObject_stype->clientdata;
+  assert(cd);
+  assert(cd->pytype);
+  return cd->pytype;
 }
 #else
-SWIGRUNTIME PyTypeObject*
-SwigPyObject_type(void) {
+SWIGRUNTIME PyTypeObject *SwigPyObject_type(void) {
   static PyTypeObject *SWIG_STATIC_POINTER(type) = SwigPyObject_TypeOnce();
   return type;
 }
 #endif
 
-SWIGRUNTIMEINLINE int
-SwigPyObject_Check(PyObject *op) {
+SWIGRUNTIMEINLINE int SwigPyObject_Check(PyObject *op) {
 #ifdef SWIGPYTHON_BUILTIN
   PyTypeObject *target_tp = SwigPyObject_type();
   if (PyType_IsSubtype(op->ob_type, target_tp))
     return 1;
   return (strcmp(op->ob_type->tp_name, "SwigPyObject") == 0);
 #else
-  return (Py_TYPE(op) == SwigPyObject_type())
-    || (strcmp(Py_TYPE(op)->tp_name,"SwigPyObject") == 0);
+  return (Py_TYPE(op) == SwigPyObject_type()) ||
+         (strcmp(Py_TYPE(op)->tp_name, "SwigPyObject") == 0);
 #endif
 }
 
-SWIGRUNTIME PyObject *
-SwigPyObject_New(void *ptr, swig_type_info *ty, int own);
+SWIGRUNTIME PyObject *SwigPyObject_New(void *ptr, swig_type_info *ty, int own);
 
-SWIGRUNTIME void
-SwigPyObject_dealloc(PyObject *v)
-{
-  SwigPyObject *sobj = (SwigPyObject *) v;
+SWIGRUNTIME void SwigPyObject_dealloc(PyObject *v) {
+  SwigPyObject *sobj = (SwigPyObject *)v;
   PyObject *next = sobj->next;
   if (sobj->own == SWIG_POINTER_OWN) {
     swig_type_info *ty = sobj->ty;
-    SwigPyClientData *data = ty ? (SwigPyClientData *) ty->clientdata : 0;
+    SwigPyClientData *data = ty ? (SwigPyClientData *)ty->clientdata : 0;
     PyObject *destroy = data ? data->destroy : 0;
     if (destroy) {
       /* destroy is always a VARARGS method */
@@ -1687,12 +1702,13 @@ SwigPyObject_dealloc(PyObject *v)
          StopIteration will be active right now, and this needs to
          remain true upon return from SwigPyObject_dealloc.  So save
          and restore. */
-      
+
       PyObject *val = NULL, *type = NULL, *tb = NULL;
       PyErr_Fetch(&val, &type, &tb);
 
       if (data->delargs) {
-        /* we need to create a temporary object to carry the destroy operation */
+        /* we need to create a temporary object to carry the destroy operation
+         */
         PyObject *tmp = SwigPyObject_New(sobj->ptr, ty, 0);
         res = SWIG_Python_CallFunctor(destroy, tmp);
         Py_DECREF(tmp);
@@ -1707,25 +1723,26 @@ SwigPyObject_dealloc(PyObject *v)
       PyErr_Restore(val, type, tb);
 
       Py_XDECREF(res);
-    } 
+    }
 #if !defined(SWIG_PYTHON_SILENT_MEMLEAK)
     else {
       const char *name = SWIG_TypePrettyName(ty);
-      printf("swig/python detected a memory leak of type '%s', no destructor found.\n", (name ? name : "unknown"));
+      printf("swig/python detected a memory leak of type '%s', no destructor "
+             "found.\n",
+             (name ? name : "unknown"));
     }
 #endif
-  } 
+  }
   Py_XDECREF(next);
   PyObject_DEL(v);
 }
 
-SWIGRUNTIME PyObject* 
-SwigPyObject_append(PyObject* v, PyObject* next)
-{
-  SwigPyObject *sobj = (SwigPyObject *) v;
+SWIGRUNTIME PyObject *SwigPyObject_append(PyObject *v, PyObject *next) {
+  SwigPyObject *sobj = (SwigPyObject *)v;
 #ifndef METH_O
   PyObject *tmp = 0;
-  if (!PyArg_ParseTuple(next,(char *)"O:append", &tmp)) return NULL;
+  if (!PyArg_ParseTuple(next, (char *)"O:append", &tmp))
+    return NULL;
   next = tmp;
 #endif
   if (!SwigPyObject_Check(next)) {
@@ -1737,15 +1754,15 @@ SwigPyObject_append(PyObject* v, PyObject* next)
   return SWIG_Py_Void();
 }
 
-SWIGRUNTIME PyObject* 
+SWIGRUNTIME PyObject *
 #ifdef METH_NOARGS
-SwigPyObject_next(PyObject* v)
+SwigPyObject_next(PyObject *v)
 #else
-SwigPyObject_next(PyObject* v, PyObject *SWIGUNUSEDPARM(args))
+SwigPyObject_next(PyObject *v, PyObject *SWIGUNUSEDPARM(args))
 #endif
 {
-  SwigPyObject *sobj = (SwigPyObject *) v;
-  if (sobj->next) {    
+  SwigPyObject *sobj = (SwigPyObject *)v;
+  if (sobj->next) {
     Py_INCREF(sobj->next);
     return sobj->next;
   } else {
@@ -1753,11 +1770,11 @@ SwigPyObject_next(PyObject* v, PyObject *SWIGUNUSEDPARM(args))
   }
 }
 
-SWIGINTERN PyObject*
+SWIGINTERN PyObject *
 #ifdef METH_NOARGS
 SwigPyObject_disown(PyObject *v)
 #else
-SwigPyObject_disown(PyObject* v, PyObject *SWIGUNUSEDPARM(args))
+SwigPyObject_disown(PyObject *v, PyObject *SWIGUNUSEDPARM(args))
 #endif
 {
   SwigPyObject *sobj = (SwigPyObject *)v;
@@ -1765,11 +1782,11 @@ SwigPyObject_disown(PyObject* v, PyObject *SWIGUNUSEDPARM(args))
   return SWIG_Py_Void();
 }
 
-SWIGINTERN PyObject*
+SWIGINTERN PyObject *
 #ifdef METH_NOARGS
 SwigPyObject_acquire(PyObject *v)
 #else
-SwigPyObject_acquire(PyObject* v, PyObject *SWIGUNUSEDPARM(args))
+SwigPyObject_acquire(PyObject *v, PyObject *SWIGUNUSEDPARM(args))
 #endif
 {
   SwigPyObject *sobj = (SwigPyObject *)v;
@@ -1777,102 +1794,103 @@ SwigPyObject_acquire(PyObject* v, PyObject *SWIGUNUSEDPARM(args))
   return SWIG_Py_Void();
 }
 
-SWIGINTERN PyObject*
-SwigPyObject_own(PyObject *v, PyObject *args)
-{
+SWIGINTERN PyObject *SwigPyObject_own(PyObject *v, PyObject *args) {
   PyObject *val = 0;
 #if (PY_VERSION_HEX < 0x02020000)
-  if (!PyArg_ParseTuple(args,(char *)"|O:own",&val))
+  if (!PyArg_ParseTuple(args, (char *)"|O:own", &val))
 #elif (PY_VERSION_HEX < 0x02050000)
-  if (!PyArg_UnpackTuple(args, (char *)"own", 0, 1, &val)) 
+  if (!PyArg_UnpackTuple(args, (char *)"own", 0, 1, &val))
 #else
-  if (!PyArg_UnpackTuple(args, "own", 0, 1, &val)) 
+  if (!PyArg_UnpackTuple(args, "own", 0, 1, &val))
 #endif
-    {
-      return NULL;
-    } 
-  else
-    {
-      SwigPyObject *sobj = (SwigPyObject *)v;
-      PyObject *obj = PyBool_FromLong(sobj->own);
-      if (val) {
+  {
+    return NULL;
+  } else {
+    SwigPyObject *sobj = (SwigPyObject *)v;
+    PyObject *obj = PyBool_FromLong(sobj->own);
+    if (val) {
 #ifdef METH_NOARGS
-	if (PyObject_IsTrue(val)) {
-	  SwigPyObject_acquire(v);
-	} else {
-	  SwigPyObject_disown(v);
-	}
+      if (PyObject_IsTrue(val)) {
+        SwigPyObject_acquire(v);
+      } else {
+        SwigPyObject_disown(v);
+      }
 #else
-	if (PyObject_IsTrue(val)) {
-	  SwigPyObject_acquire(v,args);
-	} else {
-	  SwigPyObject_disown(v,args);
-	}
-#endif
-      } 
-      return obj;
+      if (PyObject_IsTrue(val)) {
+        SwigPyObject_acquire(v, args);
+      } else {
+        SwigPyObject_disown(v, args);
+      }
+#endif
     }
+    return obj;
+  }
 }
 
 #ifdef METH_O
-static PyMethodDef
-swigobject_methods[] = {
-  {(char *)"disown",  (PyCFunction)SwigPyObject_disown,  METH_NOARGS,  (char *)"releases ownership of the pointer"},
-  {(char *)"acquire", (PyCFunction)SwigPyObject_acquire, METH_NOARGS,  (char *)"acquires ownership of the pointer"},
-  {(char *)"own",     (PyCFunction)SwigPyObject_own,     METH_VARARGS, (char *)"returns/sets ownership of the pointer"},
-  {(char *)"append",  (PyCFunction)SwigPyObject_append,  METH_O,       (char *)"appends another 'this' object"},
-  {(char *)"next",    (PyCFunction)SwigPyObject_next,    METH_NOARGS,  (char *)"returns the next 'this' object"},
-  {(char *)"__repr__",(PyCFunction)SwigPyObject_repr,    METH_NOARGS,  (char *)"returns object representation"},
-  {0, 0, 0, 0}  
-};
+static PyMethodDef swigobject_methods[] = {
+    {(char *)"disown", (PyCFunction)SwigPyObject_disown, METH_NOARGS,
+     (char *)"releases ownership of the pointer"},
+    {(char *)"acquire", (PyCFunction)SwigPyObject_acquire, METH_NOARGS,
+     (char *)"acquires ownership of the pointer"},
+    {(char *)"own", (PyCFunction)SwigPyObject_own, METH_VARARGS,
+     (char *)"returns/sets ownership of the pointer"},
+    {(char *)"append", (PyCFunction)SwigPyObject_append, METH_O,
+     (char *)"appends another 'this' object"},
+    {(char *)"next", (PyCFunction)SwigPyObject_next, METH_NOARGS,
+     (char *)"returns the next 'this' object"},
+    {(char *)"__repr__", (PyCFunction)SwigPyObject_repr, METH_NOARGS,
+     (char *)"returns object representation"},
+    {0, 0, 0, 0}};
 #else
-static PyMethodDef
-swigobject_methods[] = {
-  {(char *)"disown",  (PyCFunction)SwigPyObject_disown,  METH_VARARGS,  (char *)"releases ownership of the pointer"},
-  {(char *)"acquire", (PyCFunction)SwigPyObject_acquire, METH_VARARGS,  (char *)"acquires ownership of the pointer"},
-  {(char *)"own",     (PyCFunction)SwigPyObject_own,     METH_VARARGS,  (char *)"returns/sets ownership of the pointer"},
-  {(char *)"append",  (PyCFunction)SwigPyObject_append,  METH_VARARGS,  (char *)"appends another 'this' object"},
-  {(char *)"next",    (PyCFunction)SwigPyObject_next,    METH_VARARGS,  (char *)"returns the next 'this' object"},
-  {(char *)"__repr__",(PyCFunction)SwigPyObject_repr,   METH_VARARGS,  (char *)"returns object representation"},
-  {0, 0, 0, 0}  
-};
+static PyMethodDef swigobject_methods[] = {
+    {(char *)"disown", (PyCFunction)SwigPyObject_disown, METH_VARARGS,
+     (char *)"releases ownership of the pointer"},
+    {(char *)"acquire", (PyCFunction)SwigPyObject_acquire, METH_VARARGS,
+     (char *)"acquires ownership of the pointer"},
+    {(char *)"own", (PyCFunction)SwigPyObject_own, METH_VARARGS,
+     (char *)"returns/sets ownership of the pointer"},
+    {(char *)"append", (PyCFunction)SwigPyObject_append, METH_VARARGS,
+     (char *)"appends another 'this' object"},
+    {(char *)"next", (PyCFunction)SwigPyObject_next, METH_VARARGS,
+     (char *)"returns the next 'this' object"},
+    {(char *)"__repr__", (PyCFunction)SwigPyObject_repr, METH_VARARGS,
+     (char *)"returns object representation"},
+    {0, 0, 0, 0}};
 #endif
 
 #if PY_VERSION_HEX < 0x02020000
-SWIGINTERN PyObject *
-SwigPyObject_getattr(SwigPyObject *sobj,char *name)
-{
+SWIGINTERN PyObject *SwigPyObject_getattr(SwigPyObject *sobj, char *name) {
   return Py_FindMethod(swigobject_methods, (PyObject *)sobj, name);
 }
 #endif
 
-SWIGRUNTIME PyTypeObject*
-SwigPyObject_TypeOnce(void) {
+SWIGRUNTIME PyTypeObject *SwigPyObject_TypeOnce(void) {
   static char swigobject_doc[] = "Swig object carries a C/C++ instance pointer";
 
   static PyNumberMethods SwigPyObject_as_number = {
     (binaryfunc)0, /*nb_add*/
     (binaryfunc)0, /*nb_subtract*/
     (binaryfunc)0, /*nb_multiply*/
-    /* nb_divide removed in Python 3 */
+                   /* nb_divide removed in Python 3 */
 #if PY_VERSION_HEX < 0x03000000
     (binaryfunc)0, /*nb_divide*/
 #endif
-    (binaryfunc)0, /*nb_remainder*/
-    (binaryfunc)0, /*nb_divmod*/
-    (ternaryfunc)0,/*nb_power*/
-    (unaryfunc)0,  /*nb_negative*/
-    (unaryfunc)0,  /*nb_positive*/
-    (unaryfunc)0,  /*nb_absolute*/
-    (inquiry)0,    /*nb_nonzero*/
-    0,		   /*nb_invert*/
-    0,		   /*nb_lshift*/
-    0,		   /*nb_rshift*/
-    0,		   /*nb_and*/
-    0,		   /*nb_xor*/
-    0,		   /*nb_or*/
+    (binaryfunc)0,  /*nb_remainder*/
+    (binaryfunc)0,  /*nb_divmod*/
+    (ternaryfunc)0, /*nb_power*/
+    (unaryfunc)0,   /*nb_negative*/
+    (unaryfunc)0,   /*nb_positive*/
+    (unaryfunc)0,   /*nb_absolute*/
+    (inquiry)0,     /*nb_nonzero*/
+    0,              /*nb_invert*/
+    0,              /*nb_lshift*/
+    0,              /*nb_rshift*/
+    0,              /*nb_and*/
+    0,              /*nb_xor*/
+    0,              /*nb_or*/
 #if PY_VERSION_HEX < 0x03000000
-    0,   /*nb_coerce*/
+    0, /*nb_coerce*/
 #endif
     (unaryfunc)SwigPyObject_long, /*nb_int*/
 #if PY_VERSION_HEX < 0x03000000
@@ -1880,21 +1898,90 @@ SwigPyObject_TypeOnce(void) {
 #else
     0, /*nb_reserved*/
 #endif
-    (unaryfunc)0,                 /*nb_float*/
+    (unaryfunc)0, /*nb_float*/
 #if PY_VERSION_HEX < 0x03000000
-    (unaryfunc)SwigPyObject_oct,  /*nb_oct*/
-    (unaryfunc)SwigPyObject_hex,  /*nb_hex*/
+    (unaryfunc)SwigPyObject_oct, /*nb_oct*/
+    (unaryfunc)SwigPyObject_hex, /*nb_hex*/
 #endif
 #if PY_VERSION_HEX >= 0x03050000 /* 3.5 */
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_inplace_matrix_multiply */
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0 /* nb_inplace_add -> nb_inplace_matrix_multiply */
 #elif PY_VERSION_HEX >= 0x03000000 /* 3.0 */
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_index, nb_inplace_divide removed */
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0 /* nb_inplace_add -> nb_index, nb_inplace_divide removed */
 #elif PY_VERSION_HEX >= 0x02050000 /* 2.5.0 */
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_index */
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0 /* nb_inplace_add -> nb_index */
 #elif PY_VERSION_HEX >= 0x02020000 /* 2.2.0 */
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_inplace_true_divide */
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0 /* nb_inplace_add -> nb_inplace_true_divide */
 #elif PY_VERSION_HEX >= 0x02000000 /* 2.0.0 */
-    0,0,0,0,0,0,0,0,0,0,0 /* nb_inplace_add -> nb_inplace_or */
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0,
+    0 /* nb_inplace_add -> nb_inplace_or */
 #endif
   };
 
@@ -1902,84 +1989,83 @@ SwigPyObject_TypeOnce(void) {
   static int type_init = 0;
   if (!type_init) {
     const PyTypeObject tmp = {
-      /* PyObject header changed in Python 3 */
+    /* PyObject header changed in Python 3 */
 #if PY_VERSION_HEX >= 0x03000000
       PyVarObject_HEAD_INIT(NULL, 0)
 #else
-      PyObject_HEAD_INIT(NULL)
-      0,                                    /* ob_size */
-#endif
-      (char *)"SwigPyObject",               /* tp_name */
-      sizeof(SwigPyObject),                 /* tp_basicsize */
-      0,                                    /* tp_itemsize */
-      (destructor)SwigPyObject_dealloc,     /* tp_dealloc */
-      0,				    /* tp_print */
+      PyObject_HEAD_INIT(NULL) 0,    /* ob_size */
+#endif
+          (char *) "SwigPyObject",      /* tp_name */
+      sizeof(SwigPyObject),             /* tp_basicsize */
+      0,                                /* tp_itemsize */
+      (destructor)SwigPyObject_dealloc, /* tp_dealloc */
+      0,                                /* tp_print */
 #if PY_VERSION_HEX < 0x02020000
-      (getattrfunc)SwigPyObject_getattr,    /* tp_getattr */
+      (getattrfunc)SwigPyObject_getattr, /* tp_getattr */
 #else
-      (getattrfunc)0,                       /* tp_getattr */
+      (getattrfunc)0,                /* tp_getattr */
 #endif
-      (setattrfunc)0,                       /* tp_setattr */
+      (setattrfunc)0, /* tp_setattr */
 #if PY_VERSION_HEX >= 0x03000000
-    0, /* tp_reserved in 3.0.1, tp_compare in 3.0.0 but not used */
+      0, /* tp_reserved in 3.0.1, tp_compare in 3.0.0 but not used */
 #else
-      (cmpfunc)SwigPyObject_compare,        /* tp_compare */
-#endif
-      (reprfunc)SwigPyObject_repr,          /* tp_repr */
-      &SwigPyObject_as_number,              /* tp_as_number */
-      0,                                    /* tp_as_sequence */
-      0,                                    /* tp_as_mapping */
-      (hashfunc)0,                          /* tp_hash */
-      (ternaryfunc)0,                       /* tp_call */
-      0,				    /* tp_str */
-      PyObject_GenericGetAttr,              /* tp_getattro */
-      0,                                    /* tp_setattro */
-      0,                                    /* tp_as_buffer */
-      Py_TPFLAGS_DEFAULT,                   /* tp_flags */
-      swigobject_doc,                       /* tp_doc */
-      0,                                    /* tp_traverse */
-      0,                                    /* tp_clear */
-      (richcmpfunc)SwigPyObject_richcompare,/* tp_richcompare */
-      0,                                    /* tp_weaklistoffset */
+      (cmpfunc)SwigPyObject_compare, /* tp_compare */
+#endif
+      (reprfunc)SwigPyObject_repr,           /* tp_repr */
+      &SwigPyObject_as_number,               /* tp_as_number */
+      0,                                     /* tp_as_sequence */
+      0,                                     /* tp_as_mapping */
+      (hashfunc)0,                           /* tp_hash */
+      (ternaryfunc)0,                        /* tp_call */
+      0,                                     /* tp_str */
+      PyObject_GenericGetAttr,               /* tp_getattro */
+      0,                                     /* tp_setattro */
+      0,                                     /* tp_as_buffer */
+      Py_TPFLAGS_DEFAULT,                    /* tp_flags */
+      swigobject_doc,                        /* tp_doc */
+      0,                                     /* tp_traverse */
+      0,                                     /* tp_clear */
+      (richcmpfunc)SwigPyObject_richcompare, /* tp_richcompare */
+      0,                                     /* tp_weaklistoffset */
 #if PY_VERSION_HEX >= 0x02020000
-      0,                                    /* tp_iter */
-      0,                                    /* tp_iternext */
-      swigobject_methods,                   /* tp_methods */
-      0,                                    /* tp_members */
-      0,                                    /* tp_getset */
-      0,                                    /* tp_base */
-      0,                                    /* tp_dict */
-      0,                                    /* tp_descr_get */
-      0,                                    /* tp_descr_set */
-      0,                                    /* tp_dictoffset */
-      0,                                    /* tp_init */
-      0,                                    /* tp_alloc */
-      0,                                    /* tp_new */
-      0,                                    /* tp_free */
-      0,                                    /* tp_is_gc */
-      0,                                    /* tp_bases */
-      0,                                    /* tp_mro */
-      0,                                    /* tp_cache */
-      0,                                    /* tp_subclasses */
-      0,                                    /* tp_weaklist */
+      0,                  /* tp_iter */
+      0,                  /* tp_iternext */
+      swigobject_methods, /* tp_methods */
+      0,                  /* tp_members */
+      0,                  /* tp_getset */
+      0,                  /* tp_base */
+      0,                  /* tp_dict */
+      0,                  /* tp_descr_get */
+      0,                  /* tp_descr_set */
+      0,                  /* tp_dictoffset */
+      0,                  /* tp_init */
+      0,                  /* tp_alloc */
+      0,                  /* tp_new */
+      0,                  /* tp_free */
+      0,                  /* tp_is_gc */
+      0,                  /* tp_bases */
+      0,                  /* tp_mro */
+      0,                  /* tp_cache */
+      0,                  /* tp_subclasses */
+      0,                  /* tp_weaklist */
 #endif
 #if PY_VERSION_HEX >= 0x02030000
-      0,                                    /* tp_del */
+      0, /* tp_del */
 #endif
 #if PY_VERSION_HEX >= 0x02060000
-      0,                                    /* tp_version_tag */
+      0, /* tp_version_tag */
 #endif
 #if PY_VERSION_HEX >= 0x03040000
-      0,                                    /* tp_finalize */
+      0, /* tp_finalize */
 #endif
 #ifdef COUNT_ALLOCS
-      0,                                    /* tp_allocs */
-      0,                                    /* tp_frees */
-      0,                                    /* tp_maxalloc */
+      0, /* tp_allocs */
+      0, /* tp_frees */
+      0, /* tp_maxalloc */
 #if PY_VERSION_HEX >= 0x02050000
-      0,                                    /* tp_prev */
+      0, /* tp_prev */
 #endif
-      0                                     /* tp_next */
+      0 /* tp_next */
 #endif
     };
     swigpyobject_type = tmp;
@@ -1994,14 +2080,12 @@ SwigPyObject_TypeOnce(void) {
   return &swigpyobject_type;
 }
 
-SWIGRUNTIME PyObject *
-SwigPyObject_New(void *ptr, swig_type_info *ty, int own)
-{
+SWIGRUNTIME PyObject *SwigPyObject_New(void *ptr, swig_type_info *ty, int own) {
   SwigPyObject *sobj = PyObject_NEW(SwigPyObject, SwigPyObject_type());
   if (sobj) {
-    sobj->ptr  = ptr;
-    sobj->ty   = ty;
-    sobj->own  = own;
+    sobj->ptr = ptr;
+    sobj->ty = ty;
+    sobj->own = own;
     sobj->next = 0;
   }
   return (PyObject *)sobj;
@@ -2009,165 +2093,153 @@ SwigPyObject_New(void *ptr, swig_type_info *ty, int own)
 
 /* -----------------------------------------------------------------------------
  * Implements a simple Swig Packed type, and use it instead of string
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 
 typedef struct {
-  PyObject_HEAD
-  void *pack;
+  PyObject_HEAD void *pack;
   swig_type_info *ty;
   size_t size;
 } SwigPyPacked;
 
-SWIGRUNTIME int
-SwigPyPacked_print(SwigPyPacked *v, FILE *fp, int SWIGUNUSEDPARM(flags))
-{
+SWIGRUNTIME int SwigPyPacked_print(SwigPyPacked *v, FILE *fp,
+                                   int SWIGUNUSEDPARM(flags)) {
   char result[SWIG_BUFFER_SIZE];
-  fputs("<Swig Packed ", fp); 
+  fputs("<Swig Packed ", fp);
   if (SWIG_PackDataName(result, v->pack, v->size, 0, sizeof(result))) {
-    fputs("at ", fp); 
-    fputs(result, fp); 
+    fputs("at ", fp);
+    fputs(result, fp);
   }
-  fputs(v->ty->name,fp); 
+  fputs(v->ty->name, fp);
   fputs(">", fp);
-  return 0; 
+  return 0;
 }
-  
-SWIGRUNTIME PyObject *
-SwigPyPacked_repr(SwigPyPacked *v)
-{
+
+SWIGRUNTIME PyObject *SwigPyPacked_repr(SwigPyPacked *v) {
   char result[SWIG_BUFFER_SIZE];
   if (SWIG_PackDataName(result, v->pack, v->size, 0, sizeof(result))) {
-    return SWIG_Python_str_FromFormat("<Swig Packed at %s%s>", result, v->ty->name);
+    return SWIG_Python_str_FromFormat("<Swig Packed at %s%s>", result,
+                                      v->ty->name);
   } else {
     return SWIG_Python_str_FromFormat("<Swig Packed %s>", v->ty->name);
-  }  
+  }
 }
 
-SWIGRUNTIME PyObject *
-SwigPyPacked_str(SwigPyPacked *v)
-{
+SWIGRUNTIME PyObject *SwigPyPacked_str(SwigPyPacked *v) {
   char result[SWIG_BUFFER_SIZE];
-  if (SWIG_PackDataName(result, v->pack, v->size, 0, sizeof(result))){
+  if (SWIG_PackDataName(result, v->pack, v->size, 0, sizeof(result))) {
     return SWIG_Python_str_FromFormat("%s%s", result, v->ty->name);
   } else {
     return SWIG_Python_str_FromChar(v->ty->name);
-  }  
+  }
 }
 
-SWIGRUNTIME int
-SwigPyPacked_compare(SwigPyPacked *v, SwigPyPacked *w)
-{
+SWIGRUNTIME int SwigPyPacked_compare(SwigPyPacked *v, SwigPyPacked *w) {
   size_t i = v->size;
   size_t j = w->size;
   int s = (i < j) ? -1 : ((i > j) ? 1 : 0);
-  return s ? s : strncmp((char *)v->pack, (char *)w->pack, 2*v->size);
+  return s ? s : strncmp((char *)v->pack, (char *)w->pack, 2 * v->size);
 }
 
-SWIGRUNTIME PyTypeObject* SwigPyPacked_TypeOnce(void);
+SWIGRUNTIME PyTypeObject *SwigPyPacked_TypeOnce(void);
 
-SWIGRUNTIME PyTypeObject*
-SwigPyPacked_type(void) {
+SWIGRUNTIME PyTypeObject *SwigPyPacked_type(void) {
   static PyTypeObject *SWIG_STATIC_POINTER(type) = SwigPyPacked_TypeOnce();
   return type;
 }
 
-SWIGRUNTIMEINLINE int
-SwigPyPacked_Check(PyObject *op) {
-  return ((op)->ob_type == SwigPyPacked_TypeOnce()) 
-    || (strcmp((op)->ob_type->tp_name,"SwigPyPacked") == 0);
+SWIGRUNTIMEINLINE int SwigPyPacked_Check(PyObject *op) {
+  return ((op)->ob_type == SwigPyPacked_TypeOnce()) ||
+         (strcmp((op)->ob_type->tp_name, "SwigPyPacked") == 0);
 }
 
-SWIGRUNTIME void
-SwigPyPacked_dealloc(PyObject *v)
-{
+SWIGRUNTIME void SwigPyPacked_dealloc(PyObject *v) {
   if (SwigPyPacked_Check(v)) {
-    SwigPyPacked *sobj = (SwigPyPacked *) v;
+    SwigPyPacked *sobj = (SwigPyPacked *)v;
     free(sobj->pack);
   }
   PyObject_DEL(v);
 }
 
-SWIGRUNTIME PyTypeObject*
-SwigPyPacked_TypeOnce(void) {
+SWIGRUNTIME PyTypeObject *SwigPyPacked_TypeOnce(void) {
   static char swigpacked_doc[] = "Swig object carries a C/C++ instance pointer";
   static PyTypeObject swigpypacked_type;
   static int type_init = 0;
   if (!type_init) {
     const PyTypeObject tmp = {
-      /* PyObject header changed in Python 3 */
-#if PY_VERSION_HEX>=0x03000000
+    /* PyObject header changed in Python 3 */
+#if PY_VERSION_HEX >= 0x03000000
       PyVarObject_HEAD_INIT(NULL, 0)
 #else
-      PyObject_HEAD_INIT(NULL)
-      0,                                    /* ob_size */
-#endif
-      (char *)"SwigPyPacked",               /* tp_name */
-      sizeof(SwigPyPacked),                 /* tp_basicsize */
-      0,                                    /* tp_itemsize */
-      (destructor)SwigPyPacked_dealloc,     /* tp_dealloc */
-      (printfunc)SwigPyPacked_print,        /* tp_print */
-      (getattrfunc)0,                       /* tp_getattr */
-      (setattrfunc)0,                       /* tp_setattr */
-#if PY_VERSION_HEX>=0x03000000
+      PyObject_HEAD_INIT(NULL) 0,    /* ob_size */
+#endif
+          (char *) "SwigPyPacked",      /* tp_name */
+      sizeof(SwigPyPacked),             /* tp_basicsize */
+      0,                                /* tp_itemsize */
+      (destructor)SwigPyPacked_dealloc, /* tp_dealloc */
+      (printfunc)SwigPyPacked_print,    /* tp_print */
+      (getattrfunc)0,                   /* tp_getattr */
+      (setattrfunc)0,                   /* tp_setattr */
+#if PY_VERSION_HEX >= 0x03000000
       0, /* tp_reserved in 3.0.1 */
 #else
-      (cmpfunc)SwigPyPacked_compare,        /* tp_compare */
-#endif
-      (reprfunc)SwigPyPacked_repr,          /* tp_repr */
-      0,                                    /* tp_as_number */
-      0,                                    /* tp_as_sequence */
-      0,                                    /* tp_as_mapping */
-      (hashfunc)0,                          /* tp_hash */
-      (ternaryfunc)0,                       /* tp_call */
-      (reprfunc)SwigPyPacked_str,           /* tp_str */
-      PyObject_GenericGetAttr,              /* tp_getattro */
-      0,                                    /* tp_setattro */
-      0,                                    /* tp_as_buffer */
-      Py_TPFLAGS_DEFAULT,                   /* tp_flags */
-      swigpacked_doc,                       /* tp_doc */
-      0,                                    /* tp_traverse */
-      0,                                    /* tp_clear */
-      0,                                    /* tp_richcompare */
-      0,                                    /* tp_weaklistoffset */
+      (cmpfunc)SwigPyPacked_compare, /* tp_compare */
+#endif
+      (reprfunc)SwigPyPacked_repr, /* tp_repr */
+      0,                           /* tp_as_number */
+      0,                           /* tp_as_sequence */
+      0,                           /* tp_as_mapping */
+      (hashfunc)0,                 /* tp_hash */
+      (ternaryfunc)0,              /* tp_call */
+      (reprfunc)SwigPyPacked_str,  /* tp_str */
+      PyObject_GenericGetAttr,     /* tp_getattro */
+      0,                           /* tp_setattro */
+      0,                           /* tp_as_buffer */
+      Py_TPFLAGS_DEFAULT,          /* tp_flags */
+      swigpacked_doc,              /* tp_doc */
+      0,                           /* tp_traverse */
+      0,                           /* tp_clear */
+      0,                           /* tp_richcompare */
+      0,                           /* tp_weaklistoffset */
 #if PY_VERSION_HEX >= 0x02020000
-      0,                                    /* tp_iter */
-      0,                                    /* tp_iternext */
-      0,                                    /* tp_methods */
-      0,                                    /* tp_members */
-      0,                                    /* tp_getset */
-      0,                                    /* tp_base */
-      0,                                    /* tp_dict */
-      0,                                    /* tp_descr_get */
-      0,                                    /* tp_descr_set */
-      0,                                    /* tp_dictoffset */
-      0,                                    /* tp_init */
-      0,                                    /* tp_alloc */
-      0,                                    /* tp_new */
-      0,                                    /* tp_free */
-      0,                                    /* tp_is_gc */
-      0,                                    /* tp_bases */
-      0,                                    /* tp_mro */
-      0,                                    /* tp_cache */
-      0,                                    /* tp_subclasses */
-      0,                                    /* tp_weaklist */
+      0, /* tp_iter */
+      0, /* tp_iternext */
+      0, /* tp_methods */
+      0, /* tp_members */
+      0, /* tp_getset */
+      0, /* tp_base */
+      0, /* tp_dict */
+      0, /* tp_descr_get */
+      0, /* tp_descr_set */
+      0, /* tp_dictoffset */
+      0, /* tp_init */
+      0, /* tp_alloc */
+      0, /* tp_new */
+      0, /* tp_free */
+      0, /* tp_is_gc */
+      0, /* tp_bases */
+      0, /* tp_mro */
+      0, /* tp_cache */
+      0, /* tp_subclasses */
+      0, /* tp_weaklist */
 #endif
 #if PY_VERSION_HEX >= 0x02030000
-      0,                                    /* tp_del */
+      0, /* tp_del */
 #endif
 #if PY_VERSION_HEX >= 0x02060000
-      0,                                    /* tp_version_tag */
+      0, /* tp_version_tag */
 #endif
 #if PY_VERSION_HEX >= 0x03040000
-      0,                                    /* tp_finalize */
+      0, /* tp_finalize */
 #endif
 #ifdef COUNT_ALLOCS
-      0,                                    /* tp_allocs */
-      0,                                    /* tp_frees */
-      0,                                    /* tp_maxalloc */
+      0, /* tp_allocs */
+      0, /* tp_frees */
+      0, /* tp_maxalloc */
 #if PY_VERSION_HEX >= 0x02050000
-      0,                                    /* tp_prev */
+      0, /* tp_prev */
 #endif
-      0                                     /* tp_next */
+      0 /* tp_next */
 #endif
     };
     swigpypacked_type = tmp;
@@ -2182,31 +2254,30 @@ SwigPyPacked_TypeOnce(void) {
   return &swigpypacked_type;
 }
 
-SWIGRUNTIME PyObject *
-SwigPyPacked_New(void *ptr, size_t size, swig_type_info *ty)
-{
+SWIGRUNTIME PyObject *SwigPyPacked_New(void *ptr, size_t size,
+                                       swig_type_info *ty) {
   SwigPyPacked *sobj = PyObject_NEW(SwigPyPacked, SwigPyPacked_type());
   if (sobj) {
     void *pack = malloc(size);
     if (pack) {
       memcpy(pack, ptr, size);
       sobj->pack = pack;
-      sobj->ty   = ty;
+      sobj->ty = ty;
       sobj->size = size;
     } else {
-      PyObject_DEL((PyObject *) sobj);
+      PyObject_DEL((PyObject *)sobj);
       sobj = 0;
     }
   }
-  return (PyObject *) sobj;
+  return (PyObject *)sobj;
 }
 
-SWIGRUNTIME swig_type_info *
-SwigPyPacked_UnpackData(PyObject *obj, void *ptr, size_t size)
-{
+SWIGRUNTIME swig_type_info *SwigPyPacked_UnpackData(PyObject *obj, void *ptr,
+                                                    size_t size) {
   if (SwigPyPacked_Check(obj)) {
     SwigPyPacked *sobj = (SwigPyPacked *)obj;
-    if (sobj->size != size) return 0;
+    if (sobj->size != size)
+      return 0;
     memcpy(ptr, sobj->pack, size);
     return sobj->ty;
   } else {
@@ -2216,19 +2287,16 @@ SwigPyPacked_UnpackData(PyObject *obj, void *ptr, size_t size)
 
 /* -----------------------------------------------------------------------------
  * pointers/data manipulation
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 
-SWIGRUNTIMEINLINE PyObject *
-_SWIG_This(void)
-{
-    return SWIG_Python_str_FromChar("this");
+SWIGRUNTIMEINLINE PyObject *_SWIG_This(void) {
+  return SWIG_Python_str_FromChar("this");
 }
 
 static PyObject *swig_this = NULL;
 
-SWIGRUNTIME PyObject *
-SWIG_This(void)
-{
+SWIGRUNTIME PyObject *SWIG_This(void) {
   if (swig_this == NULL)
     swig_this = _SWIG_This();
   return swig_this;
@@ -2237,27 +2305,25 @@ SWIG_This(void)
 /* #define SWIG_PYTHON_SLOW_GETSET_THIS */
 
 /* TODO: I don't know how to implement the fast getset in Python 3 right now */
-#if PY_VERSION_HEX>=0x03000000
-#define SWIG_PYTHON_SLOW_GETSET_THIS 
+#if PY_VERSION_HEX >= 0x03000000
+#define SWIG_PYTHON_SLOW_GETSET_THIS
 #endif
 
-SWIGRUNTIME SwigPyObject *
-SWIG_Python_GetSwigThis(PyObject *pyobj) 
-{
+SWIGRUNTIME SwigPyObject *SWIG_Python_GetSwigThis(PyObject *pyobj) {
   PyObject *obj;
 
   if (SwigPyObject_Check(pyobj))
-    return (SwigPyObject *) pyobj;
+    return (SwigPyObject *)pyobj;
 
 #ifdef SWIGPYTHON_BUILTIN
   (void)obj;
-# ifdef PyWeakref_CheckProxy
+#ifdef PyWeakref_CheckProxy
   if (PyWeakref_CheckProxy(pyobj)) {
     pyobj = PyWeakref_GET_OBJECT(pyobj);
     if (pyobj && SwigPyObject_Check(pyobj))
-      return (SwigPyObject*) pyobj;
+      return (SwigPyObject *)pyobj;
   }
-# endif
+#endif
   return NULL;
 #else
 
@@ -2265,7 +2331,7 @@ SWIG_Python_GetSwigThis(PyObject *pyobj)
 
 #if (!defined(SWIG_PYTHON_SLOW_GETSET_THIS) && (PY_VERSION_HEX >= 0x02030000))
   if (PyInstance_Check(pyobj)) {
-    obj = _PyInstance_Lookup(pyobj, SWIG_This());      
+    obj = _PyInstance_Lookup(pyobj, SWIG_This());
   } else {
     PyObject **dictptr = _PyObject_GetDictPtr(pyobj);
     if (dictptr != NULL) {
@@ -2274,31 +2340,33 @@ SWIG_Python_GetSwigThis(PyObject *pyobj)
     } else {
 #ifdef PyWeakref_CheckProxy
       if (PyWeakref_CheckProxy(pyobj)) {
-	PyObject *wobj = PyWeakref_GET_OBJECT(pyobj);
-	return wobj ? SWIG_Python_GetSwigThis(wobj) : 0;
+        PyObject *wobj = PyWeakref_GET_OBJECT(pyobj);
+        return wobj ? SWIG_Python_GetSwigThis(wobj) : 0;
       }
 #endif
-      obj = PyObject_GetAttr(pyobj,SWIG_This());
+      obj = PyObject_GetAttr(pyobj, SWIG_This());
       if (obj) {
-	Py_DECREF(obj);
+        Py_DECREF(obj);
       } else {
-	if (PyErr_Occurred()) PyErr_Clear();
-	return 0;
+        if (PyErr_Occurred())
+          PyErr_Clear();
+        return 0;
       }
     }
   }
 #else
-  obj = PyObject_GetAttr(pyobj,SWIG_This());
+  obj = PyObject_GetAttr(pyobj, SWIG_This());
   if (obj) {
     Py_DECREF(obj);
   } else {
-    if (PyErr_Occurred()) PyErr_Clear();
+    if (PyErr_Occurred())
+      PyErr_Clear();
     return 0;
   }
 #endif
   if (obj && !SwigPyObject_Check(obj)) {
     /* a PyObject is called 'this', try to get the 'real this'
-       SwigPyObject from it */ 
+       SwigPyObject from it */
     return SWIG_Python_GetSwigThis(obj);
   }
   return (SwigPyObject *)obj;
@@ -2307,8 +2375,7 @@ SWIG_Python_GetSwigThis(PyObject *pyobj)
 
 /* Acquire a pointer value */
 
-SWIGRUNTIME int
-SWIG_Python_AcquirePtr(PyObject *obj, int own) {
+SWIGRUNTIME int SWIG_Python_AcquirePtr(PyObject *obj, int own) {
   if (own == SWIG_POINTER_OWN) {
     SwigPyObject *sobj = SWIG_Python_GetSwigThis(obj);
     if (sobj) {
@@ -2322,8 +2389,9 @@ SWIG_Python_AcquirePtr(PyObject *obj, int own) {
 
 /* Convert a pointer value */
 
-SWIGRUNTIME int
-SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int flags, int *own) {
+SWIGRUNTIME int SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr,
+                                             swig_type_info *ty, int flags,
+                                             int *own) {
   int res;
   SwigPyObject *sobj;
   int implicit_conv = (flags & SWIG_POINTER_IMPLICIT_CONV) != 0;
@@ -2347,18 +2415,20 @@ SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int
       swig_type_info *to = sobj->ty;
       if (to == ty) {
         /* no type cast needed */
-        if (ptr) *ptr = vptr;
+        if (ptr)
+          *ptr = vptr;
         break;
       } else {
-        swig_cast_info *tc = SWIG_TypeCheck(to->name,ty);
+        swig_cast_info *tc = SWIG_TypeCheck(to->name, ty);
         if (!tc) {
           sobj = (SwigPyObject *)sobj->next;
         } else {
           if (ptr) {
             int newmemory = 0;
-            *ptr = SWIG_TypeCast(tc,vptr,&newmemory);
+            *ptr = SWIG_TypeCast(tc, vptr, &newmemory);
             if (newmemory == SWIG_CAST_NEW_MEMORY) {
-              assert(own); /* badly formed typemap which will lead to a memory leak - it must set and use own to delete *ptr */
+              assert(own); /* badly formed typemap which will lead to a memory
+                              leak - it must set and use own to delete *ptr */
               if (own)
                 *own = *own | SWIG_CAST_NEW_MEMORY;
             }
@@ -2367,7 +2437,8 @@ SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int
         }
       }
     } else {
-      if (ptr) *ptr = vptr;
+      if (ptr)
+        *ptr = vptr;
       break;
     }
   }
@@ -2380,12 +2451,13 @@ SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int
     res = SWIG_OK;
   } else {
     if (implicit_conv) {
-      SwigPyClientData *data = ty ? (SwigPyClientData *) ty->clientdata : 0;
+      SwigPyClientData *data = ty ? (SwigPyClientData *)ty->clientdata : 0;
       if (data && !data->implicitconv) {
         PyObject *klass = data->klass;
         if (klass) {
           PyObject *impconv;
-          data->implicitconv = 1; /* avoid recursion and call 'explicit' constructors*/
+          data->implicitconv =
+              1; /* avoid recursion and call 'explicit' constructors*/
           impconv = SWIG_Python_CallFunctor(klass, obj);
           data->implicitconv = 0;
           if (PyErr_Occurred()) {
@@ -2396,7 +2468,8 @@ SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int
             SwigPyObject *iobj = SWIG_Python_GetSwigThis(impconv);
             if (iobj) {
               void *vptr;
-              res = SWIG_Python_ConvertPtrAndOwn((PyObject*)iobj, &vptr, ty, 0, 0);
+              res = SWIG_Python_ConvertPtrAndOwn((PyObject *)iobj, &vptr, ty, 0,
+                                                 0);
               if (SWIG_IsOK(res)) {
                 if (ptr) {
                   *ptr = vptr;
@@ -2405,7 +2478,7 @@ SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int
                   res = SWIG_AddCast(res);
                   res = SWIG_AddNewMask(res);
                 } else {
-                  res = SWIG_AddCast(res);		    
+                  res = SWIG_AddCast(res);
                 }
               }
             }
@@ -2427,25 +2500,25 @@ SWIG_Python_ConvertPtrAndOwn(PyObject *obj, void **ptr, swig_type_info *ty, int
 
 /* Convert a function ptr value */
 
-SWIGRUNTIME int
-SWIG_Python_ConvertFunctionPtr(PyObject *obj, void **ptr, swig_type_info *ty) {
+SWIGRUNTIME int SWIG_Python_ConvertFunctionPtr(PyObject *obj, void **ptr,
+                                               swig_type_info *ty) {
   if (!PyCFunction_Check(obj)) {
     return SWIG_ConvertPtr(obj, ptr, ty, 0);
   } else {
     void *vptr = 0;
-    
+
     /* here we get the method pointer for callbacks */
-    const char *doc = (((PyCFunctionObject *)obj) -> m_ml -> ml_doc);
+    const char *doc = (((PyCFunctionObject *)obj)->m_ml->ml_doc);
     const char *desc = doc ? strstr(doc, "swig_ptr: ") : 0;
     if (desc)
       desc = ty ? SWIG_UnpackVoidPtr(desc + 10, &vptr, ty->name) : 0;
-    if (!desc) 
+    if (!desc)
       return SWIG_ERROR;
     if (ty) {
-      swig_cast_info *tc = SWIG_TypeCheck(desc,ty);
+      swig_cast_info *tc = SWIG_TypeCheck(desc, ty);
       if (tc) {
         int newmemory = 0;
-        *ptr = SWIG_TypeCast(tc,vptr,&newmemory);
+        *ptr = SWIG_TypeCast(tc, vptr, &newmemory);
         assert(!newmemory); /* newmemory handling not yet implemented */
       } else {
         return SWIG_ERROR;
@@ -2459,32 +2532,34 @@ SWIG_Python_ConvertFunctionPtr(PyObject *obj, void **ptr, swig_type_info *ty) {
 
 /* Convert a packed value value */
 
-SWIGRUNTIME int
-SWIG_Python_ConvertPacked(PyObject *obj, void *ptr, size_t sz, swig_type_info *ty) {
+SWIGRUNTIME int SWIG_Python_ConvertPacked(PyObject *obj, void *ptr, size_t sz,
+                                          swig_type_info *ty) {
   swig_type_info *to = SwigPyPacked_UnpackData(obj, ptr, sz);
-  if (!to) return SWIG_ERROR;
+  if (!to)
+    return SWIG_ERROR;
   if (ty) {
     if (to != ty) {
       /* check type cast? */
-      swig_cast_info *tc = SWIG_TypeCheck(to->name,ty);
-      if (!tc) return SWIG_ERROR;
+      swig_cast_info *tc = SWIG_TypeCheck(to->name, ty);
+      if (!tc)
+        return SWIG_ERROR;
     }
   }
   return SWIG_OK;
-}  
+}
 
 /* -----------------------------------------------------------------------------
  * Create a new pointer object
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 
 /*
   Create a new instance object, without calling __init__, and set the
   'this' attribute.
 */
 
-SWIGRUNTIME PyObject* 
-SWIG_Python_NewShadowInstance(SwigPyClientData *data, PyObject *swig_this)
-{
+SWIGRUNTIME PyObject *SWIG_Python_NewShadowInstance(SwigPyClientData *data,
+                                                    PyObject *swig_this) {
 #if (PY_VERSION_HEX >= 0x02020000)
   PyObject *inst = 0;
   PyObject *newraw = data->newraw;
@@ -2494,12 +2569,12 @@ SWIG_Python_NewShadowInstance(SwigPyClientData *data, PyObject *swig_this)
 #if !defined(SWIG_PYTHON_SLOW_GETSET_THIS)
       PyObject **dictptr = _PyObject_GetDictPtr(inst);
       if (dictptr != NULL) {
-	PyObject *dict = *dictptr;
-	if (dict == NULL) {
-	  dict = PyDict_New();
-	  *dictptr = dict;
-	  PyDict_SetItem(dict, SWIG_This(), swig_this);
-	}
+        PyObject *dict = *dictptr;
+        if (dict == NULL) {
+          dict = PyDict_New();
+          *dictptr = dict;
+          PyDict_SetItem(dict, SWIG_This(), swig_this);
+        }
       }
 #else
       PyObject *key = SWIG_This();
@@ -2508,7 +2583,8 @@ SWIG_Python_NewShadowInstance(SwigPyClientData *data, PyObject *swig_this)
     }
   } else {
 #if PY_VERSION_HEX >= 0x03000000
-    inst = ((PyTypeObject*) data->newargs)->tp_new((PyTypeObject*) data->newargs, Py_None, Py_None);
+    inst = ((PyTypeObject *)data->newargs)
+               ->tp_new((PyTypeObject *)data->newargs, Py_None, Py_None);
     if (inst) {
       PyObject_SetAttr(inst, SWIG_This(), swig_this);
       Py_TYPE(inst)->tp_flags &= ~Py_TPFLAGS_VALID_VERSION_TAG;
@@ -2532,7 +2608,7 @@ SWIG_Python_NewShadowInstance(SwigPyClientData *data, PyObject *swig_this)
     inst = PyInstance_NewRaw(data->newargs, dict);
     Py_DECREF(dict);
   }
-  return (PyObject *) inst;
+  return (PyObject *)inst;
 #else
   PyInstanceObject *inst = PyObject_NEW(PyInstanceObject, &PyInstance_Type);
   if (inst == NULL) {
@@ -2552,42 +2628,38 @@ SWIG_Python_NewShadowInstance(SwigPyClientData *data, PyObject *swig_this)
   PyObject_GC_Init(inst);
 #endif
   PyDict_SetItem(inst->in_dict, SWIG_This(), swig_this);
-  return (PyObject *) inst;
+  return (PyObject *)inst;
 #endif
 #endif
 }
 
-SWIGRUNTIME void
-SWIG_Python_SetSwigThis(PyObject *inst, PyObject *swig_this)
-{
- PyObject *dict;
+SWIGRUNTIME void SWIG_Python_SetSwigThis(PyObject *inst, PyObject *swig_this) {
+  PyObject *dict;
 #if (PY_VERSION_HEX >= 0x02020000) && !defined(SWIG_PYTHON_SLOW_GETSET_THIS)
- PyObject **dictptr = _PyObject_GetDictPtr(inst);
- if (dictptr != NULL) {
-   dict = *dictptr;
-   if (dict == NULL) {
-     dict = PyDict_New();
-     *dictptr = dict;
-   }
-   PyDict_SetItem(dict, SWIG_This(), swig_this);
-   return;
- }
-#endif
- dict = PyObject_GetAttrString(inst, (char*)"__dict__");
- PyDict_SetItem(dict, SWIG_This(), swig_this);
- Py_DECREF(dict);
-} 
-
+  PyObject **dictptr = _PyObject_GetDictPtr(inst);
+  if (dictptr != NULL) {
+    dict = *dictptr;
+    if (dict == NULL) {
+      dict = PyDict_New();
+      *dictptr = dict;
+    }
+    PyDict_SetItem(dict, SWIG_This(), swig_this);
+    return;
+  }
+#endif
+  dict = PyObject_GetAttrString(inst, (char *)"__dict__");
+  PyDict_SetItem(dict, SWIG_This(), swig_this);
+  Py_DECREF(dict);
+}
 
-SWIGINTERN PyObject *
-SWIG_Python_InitShadowInstance(PyObject *args) {
+SWIGINTERN PyObject *SWIG_Python_InitShadowInstance(PyObject *args) {
   PyObject *obj[2];
   if (!SWIG_Python_UnpackTuple(args, "swiginit", 2, 2, obj)) {
     return NULL;
   } else {
     SwigPyObject *sthis = SWIG_Python_GetSwigThis(obj[0]);
     if (sthis) {
-      SwigPyObject_append((PyObject*) sthis, obj[1]);
+      SwigPyObject_append((PyObject *)sthis, obj[1]);
     } else {
       SWIG_Python_SetSwigThis(obj[0], obj[1]);
     }
@@ -2597,10 +2669,11 @@ SWIG_Python_InitShadowInstance(PyObject *args) {
 
 /* Create a new pointer object */
 
-SWIGRUNTIME PyObject *
-SWIG_Python_NewPointerObj(PyObject *self, void *ptr, swig_type_info *type, int flags) {
+SWIGRUNTIME PyObject *SWIG_Python_NewPointerObj(PyObject *self, void *ptr,
+                                                swig_type_info *type,
+                                                int flags) {
   SwigPyClientData *clientdata;
-  PyObject * robj;
+  PyObject *robj;
   int own;
 
   if (!ptr)
@@ -2611,11 +2684,12 @@ SWIG_Python_NewPointerObj(PyObject *self, void *ptr, swig_type_info *type, int f
   if (clientdata && clientdata->pytype) {
     SwigPyObject *newobj;
     if (flags & SWIG_BUILTIN_TP_INIT) {
-      newobj = (SwigPyObject*) self;
+      newobj = (SwigPyObject *)self;
       if (newobj->ptr) {
-        PyObject *next_self = clientdata->pytype->tp_alloc(clientdata->pytype, 0);
+        PyObject *next_self =
+            clientdata->pytype->tp_alloc(clientdata->pytype, 0);
         while (newobj->next)
-	  newobj = (SwigPyObject *) newobj->next;
+          newobj = (SwigPyObject *)newobj->next;
         newobj->next = next_self;
         newobj = (SwigPyObject *)next_self;
 #ifdef SWIGPYTHON_BUILTIN
@@ -2633,7 +2707,7 @@ SWIG_Python_NewPointerObj(PyObject *self, void *ptr, swig_type_info *type, int f
       newobj->ty = type;
       newobj->own = own;
       newobj->next = 0;
-      return (PyObject*) newobj;
+      return (PyObject *)newobj;
     }
     return SWIG_Py_Void();
   }
@@ -2651,13 +2725,13 @@ SWIG_Python_NewPointerObj(PyObject *self, void *ptr, swig_type_info *type, int f
 
 /* Create a new packed object */
 
-SWIGRUNTIMEINLINE PyObject *
-SWIG_Python_NewPackedObj(void *ptr, size_t sz, swig_type_info *type) {
-  return ptr ? SwigPyPacked_New((void *) ptr, sz, type) : SWIG_Py_Void();
+SWIGRUNTIMEINLINE PyObject *SWIG_Python_NewPackedObj(void *ptr, size_t sz,
+                                                     swig_type_info *type) {
+  return ptr ? SwigPyPacked_New((void *)ptr, sz, type) : SWIG_Py_Void();
 }
 
 /* -----------------------------------------------------------------------------*
- *  Get type list 
+ *  Get type list
  * -----------------------------------------------------------------------------*/
 
 #ifdef SWIG_LINK_RUNTIME
@@ -2672,42 +2746,43 @@ SWIG_Python_GetModule(void *SWIGUNUSEDPARM(clientdata)) {
 #ifdef SWIG_LINK_RUNTIME
     type_pointer = SWIG_ReturnGlobalTypeList((void *)0);
 #else
-# ifdef SWIGPY_USE_CAPSULE
+#ifdef SWIGPY_USE_CAPSULE
     type_pointer = PyCapsule_Import(SWIGPY_CAPSULE_NAME, 0);
-# else
-    type_pointer = PyCObject_Import((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION,
-				    (char*)"type_pointer" SWIG_TYPE_TABLE_NAME);
-# endif
+#else
+    type_pointer =
+        PyCObject_Import((char *)"swig_runtime_data" SWIG_RUNTIME_VERSION,
+                         (char *)"type_pointer" SWIG_TYPE_TABLE_NAME);
+#endif
     if (PyErr_Occurred()) {
       PyErr_Clear();
       type_pointer = (void *)0;
     }
 #endif
   }
-  return (swig_module_info *) type_pointer;
+  return (swig_module_info *)type_pointer;
 }
 
 #if PY_MAJOR_VERSION < 2
-/* PyModule_AddObject function was introduced in Python 2.0.  The following function
-   is copied out of Python/modsupport.c in python version 2.3.4 */
-SWIGINTERN int
-PyModule_AddObject(PyObject *m, char *name, PyObject *o)
-{
+/* PyModule_AddObject function was introduced in Python 2.0.  The following
+   function is copied out of Python/modsupport.c in python version 2.3.4 */
+SWIGINTERN int PyModule_AddObject(PyObject *m, char *name, PyObject *o) {
   PyObject *dict;
   if (!PyModule_Check(m)) {
-    PyErr_SetString(PyExc_TypeError, "PyModule_AddObject() needs module as first arg");
+    PyErr_SetString(PyExc_TypeError,
+                    "PyModule_AddObject() needs module as first arg");
     return SWIG_ERROR;
   }
   if (!o) {
-    PyErr_SetString(PyExc_TypeError, "PyModule_AddObject() needs non-NULL value");
+    PyErr_SetString(PyExc_TypeError,
+                    "PyModule_AddObject() needs non-NULL value");
     return SWIG_ERROR;
   }
-  
+
   dict = PyModule_GetDict(m);
   if (dict == NULL) {
     /* Internal error -- modules must have a dict! */
     PyErr_Format(PyExc_SystemError, "module '%s' has no __dict__",
-		 PyModule_GetName(m));
+                 PyModule_GetName(m));
     return SWIG_ERROR;
   }
   if (PyDict_SetItemString(dict, name, o))
@@ -2725,43 +2800,52 @@ SWIG_Python_DestroyModule(void *vptr)
 #endif
 {
 #ifdef SWIGPY_USE_CAPSULE
-  swig_module_info *swig_module = (swig_module_info *) PyCapsule_GetPointer(obj, SWIGPY_CAPSULE_NAME);
+  swig_module_info *swig_module =
+      (swig_module_info *)PyCapsule_GetPointer(obj, SWIGPY_CAPSULE_NAME);
 #else
-  swig_module_info *swig_module = (swig_module_info *) vptr;
+  swig_module_info *swig_module = (swig_module_info *)vptr;
 #endif
   swig_type_info **types = swig_module->types;
   size_t i;
-  for (i =0; i < swig_module->size; ++i) {
+  for (i = 0; i < swig_module->size; ++i) {
     swig_type_info *ty = types[i];
     if (ty->owndata) {
-      SwigPyClientData *data = (SwigPyClientData *) ty->clientdata;
-      if (data) SwigPyClientData_Del(data);
+      SwigPyClientData *data = (SwigPyClientData *)ty->clientdata;
+      if (data)
+        SwigPyClientData_Del(data);
     }
   }
   Py_DECREF(SWIG_This());
   swig_this = NULL;
 }
 
-SWIGRUNTIME void
-SWIG_Python_SetModule(swig_module_info *swig_module) {
+SWIGRUNTIME void SWIG_Python_SetModule(swig_module_info *swig_module) {
 #if PY_VERSION_HEX >= 0x03000000
- /* Add a dummy module object into sys.modules */
-  PyObject *module = PyImport_AddModule((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION);
+  /* Add a dummy module object into sys.modules */
+  PyObject *module =
+      PyImport_AddModule((char *)"swig_runtime_data" SWIG_RUNTIME_VERSION);
 #else
-  static PyMethodDef swig_empty_runtime_method_table[] = { {NULL, NULL, 0, NULL} }; /* Sentinel */
-  PyObject *module = Py_InitModule((char*)"swig_runtime_data" SWIG_RUNTIME_VERSION, swig_empty_runtime_method_table);
+  static PyMethodDef swig_empty_runtime_method_table[] = {
+      {NULL, NULL, 0, NULL}}; /* Sentinel */
+  PyObject *module =
+      Py_InitModule((char *)"swig_runtime_data" SWIG_RUNTIME_VERSION,
+                    swig_empty_runtime_method_table);
 #endif
 #ifdef SWIGPY_USE_CAPSULE
-  PyObject *pointer = PyCapsule_New((void *) swig_module, SWIGPY_CAPSULE_NAME, SWIG_Python_DestroyModule);
+  PyObject *pointer = PyCapsule_New((void *)swig_module, SWIGPY_CAPSULE_NAME,
+                                    SWIG_Python_DestroyModule);
   if (pointer && module) {
-    PyModule_AddObject(module, (char*)"type_pointer_capsule" SWIG_TYPE_TABLE_NAME, pointer);
+    PyModule_AddObject(
+        module, (char *)"type_pointer_capsule" SWIG_TYPE_TABLE_NAME, pointer);
   } else {
     Py_XDECREF(pointer);
   }
 #else
-  PyObject *pointer = PyCObject_FromVoidPtr((void *) swig_module, SWIG_Python_DestroyModule);
+  PyObject *pointer =
+      PyCObject_FromVoidPtr((void *)swig_module, SWIG_Python_DestroyModule);
   if (pointer && module) {
-    PyModule_AddObject(module, (char*)"type_pointer" SWIG_TYPE_TABLE_NAME, pointer);
+    PyModule_AddObject(module, (char *)"type_pointer" SWIG_TYPE_TABLE_NAME,
+                       pointer);
   } else {
     Py_XDECREF(pointer);
   }
@@ -2769,31 +2853,28 @@ SWIG_Python_SetModule(swig_module_info *swig_module) {
 }
 
 /* The python cached type query */
-SWIGRUNTIME PyObject *
-SWIG_Python_TypeCache(void) {
+SWIGRUNTIME PyObject *SWIG_Python_TypeCache(void) {
   static PyObject *SWIG_STATIC_POINTER(cache) = PyDict_New();
   return cache;
 }
 
-SWIGRUNTIME swig_type_info *
-SWIG_Python_TypeQuery(const char *type)
-{
+SWIGRUNTIME swig_type_info *SWIG_Python_TypeQuery(const char *type) {
   PyObject *cache = SWIG_Python_TypeCache();
-  PyObject *key = SWIG_Python_str_FromChar(type); 
+  PyObject *key = SWIG_Python_str_FromChar(type);
   PyObject *obj = PyDict_GetItem(cache, key);
   swig_type_info *descriptor;
   if (obj) {
 #ifdef SWIGPY_USE_CAPSULE
-    descriptor = (swig_type_info *) PyCapsule_GetPointer(obj, NULL);
+    descriptor = (swig_type_info *)PyCapsule_GetPointer(obj, NULL);
 #else
-    descriptor = (swig_type_info *) PyCObject_AsVoidPtr(obj);
+    descriptor = (swig_type_info *)PyCObject_AsVoidPtr(obj);
 #endif
   } else {
     swig_module_info *swig_module = SWIG_GetModule(0);
     descriptor = SWIG_TypeQueryModule(swig_module, swig_module, type);
     if (descriptor) {
 #ifdef SWIGPY_USE_CAPSULE
-      obj = PyCapsule_New((void*) descriptor, NULL, NULL);
+      obj = PyCapsule_New((void *)descriptor, NULL, NULL);
 #else
       obj = PyCObject_FromVoidPtr(descriptor, NULL);
 #endif
@@ -2805,16 +2886,15 @@ SWIG_Python_TypeQuery(const char *type)
   return descriptor;
 }
 
-/* 
+/*
    For backward compatibility only
 */
-#define SWIG_POINTER_EXCEPTION  0
-#define SWIG_arg_fail(arg)      SWIG_Python_ArgFail(arg)
-#define SWIG_MustGetPtr(p, type, argnum, flags)  SWIG_Python_MustGetPtr(p, type, argnum, flags)
+#define SWIG_POINTER_EXCEPTION 0
+#define SWIG_arg_fail(arg) SWIG_Python_ArgFail(arg)
+#define SWIG_MustGetPtr(p, type, argnum, flags)                                \
+  SWIG_Python_MustGetPtr(p, type, argnum, flags)
 
-SWIGRUNTIME int
-SWIG_Python_AddErrMesg(const char* mesg, int infront)
-{  
+SWIGRUNTIME int SWIG_Python_AddErrMesg(const char *mesg, int infront) {
   if (PyErr_Occurred()) {
     PyObject *type = 0;
     PyObject *value = 0;
@@ -2826,9 +2906,11 @@ SWIG_Python_AddErrMesg(const char* mesg, int infront)
       Py_XINCREF(type);
       PyErr_Clear();
       if (infront) {
-	PyErr_Format(type, "%s %s", mesg, tmp = SWIG_Python_str_AsChar(old_str));
+        PyErr_Format(type, "%s %s", mesg,
+                     tmp = SWIG_Python_str_AsChar(old_str));
       } else {
-	PyErr_Format(type, "%s %s", tmp = SWIG_Python_str_AsChar(old_str), mesg);
+        PyErr_Format(type, "%s %s", tmp = SWIG_Python_str_AsChar(old_str),
+                     mesg);
       }
       SWIG_Python_str_DelForPy3(tmp);
       Py_DECREF(old_str);
@@ -2838,10 +2920,8 @@ SWIG_Python_AddErrMesg(const char* mesg, int infront)
     return 0;
   }
 }
-  
-SWIGRUNTIME int
-SWIG_Python_ArgFail(int argnum)
-{
+
+SWIGRUNTIME int SWIG_Python_ArgFail(int argnum) {
   if (PyErr_Occurred()) {
     /* add information about failing argument */
     char mesg[256];
@@ -2852,55 +2932,53 @@ SWIG_Python_ArgFail(int argnum)
   }
 }
 
-SWIGRUNTIMEINLINE const char *
-SwigPyObject_GetDesc(PyObject *self)
-{
+SWIGRUNTIMEINLINE const char *SwigPyObject_GetDesc(PyObject *self) {
   SwigPyObject *v = (SwigPyObject *)self;
   swig_type_info *ty = v ? v->ty : 0;
   return ty ? ty->str : "";
 }
 
-SWIGRUNTIME void
-SWIG_Python_TypeError(const char *type, PyObject *obj)
-{
+SWIGRUNTIME void SWIG_Python_TypeError(const char *type, PyObject *obj) {
   if (type) {
 #if defined(SWIG_COBJECT_TYPES)
     if (obj && SwigPyObject_Check(obj)) {
-      const char *otype = (const char *) SwigPyObject_GetDesc(obj);
+      const char *otype = (const char *)SwigPyObject_GetDesc(obj);
       if (otype) {
-	PyErr_Format(PyExc_TypeError, "a '%s' is expected, 'SwigPyObject(%s)' is received",
-		     type, otype);
-	return;
+        PyErr_Format(PyExc_TypeError,
+                     "a '%s' is expected, 'SwigPyObject(%s)' is received", type,
+                     otype);
+        return;
       }
-    } else 
-#endif      
+    } else
+#endif
     {
-      const char *otype = (obj ? obj->ob_type->tp_name : 0); 
+      const char *otype = (obj ? obj->ob_type->tp_name : 0);
       if (otype) {
-	PyObject *str = PyObject_Str(obj);
-	const char *cstr = str ? SWIG_Python_str_AsChar(str) : 0;
-	if (cstr) {
-	  PyErr_Format(PyExc_TypeError, "a '%s' is expected, '%s(%s)' is received",
-		       type, otype, cstr);
+        PyObject *str = PyObject_Str(obj);
+        const char *cstr = str ? SWIG_Python_str_AsChar(str) : 0;
+        if (cstr) {
+          PyErr_Format(PyExc_TypeError,
+                       "a '%s' is expected, '%s(%s)' is received", type, otype,
+                       cstr);
           SWIG_Python_str_DelForPy3(cstr);
-	} else {
-	  PyErr_Format(PyExc_TypeError, "a '%s' is expected, '%s' is received",
-		       type, otype);
-	}
-	Py_XDECREF(str);
-	return;
+        } else {
+          PyErr_Format(PyExc_TypeError, "a '%s' is expected, '%s' is received",
+                       type, otype);
+        }
+        Py_XDECREF(str);
+        return;
       }
-    }   
+    }
     PyErr_Format(PyExc_TypeError, "a '%s' is expected", type);
   } else {
     PyErr_Format(PyExc_TypeError, "unexpected type is received");
   }
 }
 
-
 /* Convert a pointer value, signal an exception on a type mismatch */
-SWIGRUNTIME void *
-SWIG_Python_MustGetPtr(PyObject *obj, swig_type_info *ty, int SWIGUNUSEDPARM(argnum), int flags) {
+SWIGRUNTIME void *SWIG_Python_MustGetPtr(PyObject *obj, swig_type_info *ty,
+                                         int SWIGUNUSEDPARM(argnum),
+                                         int flags) {
   void *result;
   if (SWIG_Python_ConvertPtr(obj, &result, ty, flags) == -1) {
     PyErr_Clear();
@@ -2915,25 +2993,27 @@ SWIG_Python_MustGetPtr(PyObject *obj, swig_type_info *ty, int SWIGUNUSEDPARM(arg
 }
 
 #ifdef SWIGPYTHON_BUILTIN
-SWIGRUNTIME int
-SWIG_Python_NonDynamicSetAttr(PyObject *obj, PyObject *name, PyObject *value) {
+SWIGRUNTIME int SWIG_Python_NonDynamicSetAttr(PyObject *obj, PyObject *name,
+                                              PyObject *value) {
   PyTypeObject *tp = obj->ob_type;
   PyObject *descr;
   PyObject *encoded_name;
   descrsetfunc f;
   int res = -1;
 
-# ifdef Py_USING_UNICODE
+#ifdef Py_USING_UNICODE
   if (PyString_Check(name)) {
-    name = PyUnicode_Decode(PyString_AsString(name), PyString_Size(name), NULL, NULL);
+    name = PyUnicode_Decode(PyString_AsString(name), PyString_Size(name), NULL,
+                            NULL);
     if (!name)
       return -1;
   } else if (!PyUnicode_Check(name))
-# else
+#else
   if (!PyString_Check(name))
-# endif
+#endif
   {
-    PyErr_Format(PyExc_TypeError, "attribute name must be string, not '%.200s'", name->ob_type->tp_name);
+    PyErr_Format(PyExc_TypeError, "attribute name must be string, not '%.200s'",
+                 name->ob_type->tp_name);
     return -1;
   } else {
     Py_INCREF(name);
@@ -2955,30 +3035,35 @@ SWIG_Python_NonDynamicSetAttr(PyObject *obj, PyObject *name, PyObject *value) {
     } else {
       encoded_name = PyUnicode_AsUTF8String(name);
     }
-    PyErr_Format(PyExc_AttributeError, "'%.100s' object has no attribute '%.200s'", tp->tp_name, PyString_AsString(encoded_name));
+    PyErr_Format(PyExc_AttributeError,
+                 "'%.100s' object has no attribute '%.200s'", tp->tp_name,
+                 PyString_AsString(encoded_name));
     Py_DECREF(encoded_name);
   } else {
     res = f(descr, obj, value);
   }
-  
-  done:
+
+done:
   Py_DECREF(name);
   return res;
 }
 #endif
 
-
 #ifdef __cplusplus
 }
 #endif
 
+#define SWIG_exception_fail(code, msg)                                         \
+  do {                                                                         \
+    SWIG_Error(code, msg);                                                     \
+    SWIG_fail;                                                                 \
+  } while (0)
 
-
-#define SWIG_exception_fail(code, msg) do { SWIG_Error(code, msg); SWIG_fail; } while(0) 
-
-#define SWIG_contract_assert(expr, msg) if (!(expr)) { SWIG_Error(SWIG_RuntimeError, msg); SWIG_fail; } else 
-
-
+#define SWIG_contract_assert(expr, msg)                                        \
+  if (!(expr)) {                                                               \
+    SWIG_Error(SWIG_RuntimeError, msg);                                        \
+    SWIG_fail;                                                                 \
+  } else
 
 /* -------- TYPES TABLE (BEGIN) -------- */
 
@@ -2986,41 +3071,40 @@ SWIG_Python_NonDynamicSetAttr(PyObject *obj, PyObject *name, PyObject *value) {
 #define SWIGTYPE_p_float swig_types[1]
 static swig_type_info *swig_types[3];
 static swig_module_info swig_module = {swig_types, 2, 0, 0, 0, 0};
-#define SWIG_TypeQuery(name) SWIG_TypeQueryModule(&swig_module, &swig_module, name)
-#define SWIG_MangledTypeQuery(name) SWIG_MangledTypeQueryModule(&swig_module, &swig_module, name)
+#define SWIG_TypeQuery(name)                                                   \
+  SWIG_TypeQueryModule(&swig_module, &swig_module, name)
+#define SWIG_MangledTypeQuery(name)                                            \
+  SWIG_MangledTypeQueryModule(&swig_module, &swig_module, name)
 
 /* -------- TYPES TABLE (END) -------- */
 
 #if (PY_VERSION_HEX <= 0x02000000)
-# if !defined(SWIG_PYTHON_CLASSIC)
-#  error "This python version requires swig to be run with the '-classic' option"
-# endif
+#if !defined(SWIG_PYTHON_CLASSIC)
+#error "This python version requires swig to be run with the '-classic' option"
+#endif
 #endif
 
 /*-----------------------------------------------
               @(target):= _cmodule.so
   ------------------------------------------------*/
 #if PY_VERSION_HEX >= 0x03000000
-#  define SWIG_init    PyInit__cmodule
+#define SWIG_init PyInit__cmodule
 
 #else
-#  define SWIG_init    init_cmodule
+#define SWIG_init init_cmodule
 
 #endif
-#define SWIG_name    "_cmodule"
+#define SWIG_name "_cmodule"
 
-#define SWIGVERSION 0x030008 
+#define SWIGVERSION 0x030008
 #define SWIG_VERSION SWIGVERSION
 
-
-#define SWIG_as_voidptr(a) (void *)((const void *)(a)) 
-#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a),(void**)(a)) 
-
+#define SWIG_as_voidptr(a) (void *)((const void *)(a))
+#define SWIG_as_voidptrptr(a) ((void)SWIG_as_voidptr(*a), (void **)(a))
 
 #define SWIG_FILE_WITH_INIT
 #include "gamut_map.h"
 
-
 #ifndef SWIG_FILE_WITH_INIT
 #define NO_IMPORT_ARRAY
 #endif
@@ -3028,520 +3112,457 @@ static swig_module_info swig_module = {swig_types, 2, 0, 0, 0, 0};
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 #include <numpy/arrayobject.h>
 
-
-void gamut_map_full(float* input, int in_dim1, int in_dim2, int in_dim3,
-                    float* result, int out_dim1, int out_dim2, int out_dim3,
-                    float* ctrl_pts, int cp_dim1, int cp_dim2,
-                    float* weights, int weight_dim1, int weight_dim2,
-                    float* coefs, int coef_dim1, int coef_dim2) {
-    gamut_map(input, in_dim1, in_dim2, in_dim3, result, ctrl_pts, weights, coefs, cp_dim1);
+void gamut_map_full(float *input, int in_dim1, int in_dim2, int in_dim3,
+                    float *result, int out_dim1, int out_dim2, int out_dim3,
+                    float *ctrl_pts, int cp_dim1, int cp_dim2, float *weights,
+                    int weight_dim1, int weight_dim2, float *coefs,
+                    int coef_dim1, int coef_dim2) {
+  gamut_map(input, in_dim1, in_dim2, in_dim3, result, ctrl_pts, weights, coefs,
+            cp_dim1);
 }
 
-
 #if NPY_API_VERSION < 0x00000007
 #define NPY_ARRAY_DEFAULT NPY_DEFAULT
-#define NPY_ARRAY_FARRAY  NPY_FARRAY
-#define NPY_FORTRANORDER  NPY_FORTRAN
+#define NPY_ARRAY_FARRAY NPY_FARRAY
+#define NPY_FORTRANORDER NPY_FORTRAN
 #endif
 
-
 /* Macros to extract array attributes.
  */
 #if NPY_API_VERSION < 0x00000007
-#define is_array(a)            ((a) && PyArray_Check((PyArrayObject*)a))
-#define array_type(a)          (int)(PyArray_TYPE((PyArrayObject*)a))
-#define array_numdims(a)       (((PyArrayObject*)a)->nd)
-#define array_dimensions(a)    (((PyArrayObject*)a)->dimensions)
-#define array_size(a,i)        (((PyArrayObject*)a)->dimensions[i])
-#define array_strides(a)       (((PyArrayObject*)a)->strides)
-#define array_stride(a,i)      (((PyArrayObject*)a)->strides[i])
-#define array_data(a)          (((PyArrayObject*)a)->data)
-#define array_descr(a)         (((PyArrayObject*)a)->descr)
-#define array_flags(a)         (((PyArrayObject*)a)->flags)
-#define array_clearflags(a,f)  (((PyArrayObject*)a)->flags) &= ~f
-#define array_enableflags(a,f) (((PyArrayObject*)a)->flags) = f
-#define array_is_fortran(a)    (PyArray_ISFORTRAN((PyArrayObject*)a))
+#define is_array(a) ((a) && PyArray_Check((PyArrayObject *)a))
+#define array_type(a) (int)(PyArray_TYPE((PyArrayObject *)a))
+#define array_numdims(a) (((PyArrayObject *)a)->nd)
+#define array_dimensions(a) (((PyArrayObject *)a)->dimensions)
+#define array_size(a, i) (((PyArrayObject *)a)->dimensions[i])
+#define array_strides(a) (((PyArrayObject *)a)->strides)
+#define array_stride(a, i) (((PyArrayObject *)a)->strides[i])
+#define array_data(a) (((PyArrayObject *)a)->data)
+#define array_descr(a) (((PyArrayObject *)a)->descr)
+#define array_flags(a) (((PyArrayObject *)a)->flags)
+#define array_clearflags(a, f) (((PyArrayObject *)a)->flags) &= ~f
+#define array_enableflags(a, f) (((PyArrayObject *)a)->flags) = f
+#define array_is_fortran(a) (PyArray_ISFORTRAN((PyArrayObject *)a))
 #else
-#define is_array(a)            ((a) && PyArray_Check(a))
-#define array_type(a)          PyArray_TYPE((PyArrayObject*)a)
-#define array_numdims(a)       PyArray_NDIM((PyArrayObject*)a)
-#define array_dimensions(a)    PyArray_DIMS((PyArrayObject*)a)
-#define array_strides(a)       PyArray_STRIDES((PyArrayObject*)a)
-#define array_stride(a,i)      PyArray_STRIDE((PyArrayObject*)a,i)
-#define array_size(a,i)        PyArray_DIM((PyArrayObject*)a,i)
-#define array_data(a)          PyArray_DATA((PyArrayObject*)a)
-#define array_descr(a)         PyArray_DESCR((PyArrayObject*)a)
-#define array_flags(a)         PyArray_FLAGS((PyArrayObject*)a)
-#define array_enableflags(a,f) PyArray_ENABLEFLAGS((PyArrayObject*)a,f)
-#define array_clearflags(a,f)  PyArray_CLEARFLAGS((PyArrayObject*)a,f)
-#define array_is_fortran(a)    (PyArray_IS_F_CONTIGUOUS((PyArrayObject*)a))
-#endif
-#define array_is_contiguous(a) (PyArray_ISCONTIGUOUS((PyArrayObject*)a))
-#define array_is_native(a)     (PyArray_ISNOTSWAPPED((PyArrayObject*)a))
-
-
-  /* Given a PyObject, return a string describing its type.
-   */
-  const char* pytype_string(PyObject* py_obj)
-  {
-    if (py_obj == NULL          ) return "C NULL value";
-    if (py_obj == Py_None       ) return "Python None" ;
-    if (PyCallable_Check(py_obj)) return "callable"    ;
-    if (PyString_Check(  py_obj)) return "string"      ;
-    if (PyInt_Check(     py_obj)) return "int"         ;
-    if (PyFloat_Check(   py_obj)) return "float"       ;
-    if (PyDict_Check(    py_obj)) return "dict"        ;
-    if (PyList_Check(    py_obj)) return "list"        ;
-    if (PyTuple_Check(   py_obj)) return "tuple"       ;
+#define is_array(a) ((a) && PyArray_Check(a))
+#define array_type(a) PyArray_TYPE((PyArrayObject *)a)
+#define array_numdims(a) PyArray_NDIM((PyArrayObject *)a)
+#define array_dimensions(a) PyArray_DIMS((PyArrayObject *)a)
+#define array_strides(a) PyArray_STRIDES((PyArrayObject *)a)
+#define array_stride(a, i) PyArray_STRIDE((PyArrayObject *)a, i)
+#define array_size(a, i) PyArray_DIM((PyArrayObject *)a, i)
+#define array_data(a) PyArray_DATA((PyArrayObject *)a)
+#define array_descr(a) PyArray_DESCR((PyArrayObject *)a)
+#define array_flags(a) PyArray_FLAGS((PyArrayObject *)a)
+#define array_enableflags(a, f) PyArray_ENABLEFLAGS((PyArrayObject *)a, f)
+#define array_clearflags(a, f) PyArray_CLEARFLAGS((PyArrayObject *)a, f)
+#define array_is_fortran(a) (PyArray_IS_F_CONTIGUOUS((PyArrayObject *)a))
+#endif
+#define array_is_contiguous(a) (PyArray_ISCONTIGUOUS((PyArrayObject *)a))
+#define array_is_native(a) (PyArray_ISNOTSWAPPED((PyArrayObject *)a))
+
+/* Given a PyObject, return a string describing its type.
+ */
+const char *pytype_string(PyObject *py_obj) {
+  if (py_obj == NULL)
+    return "C NULL value";
+  if (py_obj == Py_None)
+    return "Python None";
+  if (PyCallable_Check(py_obj))
+    return "callable";
+  if (PyString_Check(py_obj))
+    return "string";
+  if (PyInt_Check(py_obj))
+    return "int";
+  if (PyFloat_Check(py_obj))
+    return "float";
+  if (PyDict_Check(py_obj))
+    return "dict";
+  if (PyList_Check(py_obj))
+    return "list";
+  if (PyTuple_Check(py_obj))
+    return "tuple";
 #if PY_MAJOR_VERSION < 3
-    if (PyFile_Check(    py_obj)) return "file"        ;
-    if (PyModule_Check(  py_obj)) return "module"      ;
-    if (PyInstance_Check(py_obj)) return "instance"    ;
+  if (PyFile_Check(py_obj))
+    return "file";
+  if (PyModule_Check(py_obj))
+    return "module";
+  if (PyInstance_Check(py_obj))
+    return "instance";
 #endif
 
-    return "unknown type";
-  }
+  return "unknown type";
+}
 
-  /* Given a NumPy typecode, return a string describing the type.
-   */
-  const char* typecode_string(int typecode)
-  {
-    static const char* type_names[25] = {"bool",
-                                         "byte",
-                                         "unsigned byte",
-                                         "short",
-                                         "unsigned short",
-                                         "int",
-                                         "unsigned int",
-                                         "long",
-                                         "unsigned long",
-                                         "long long",
-                                         "unsigned long long",
-                                         "float",
-                                         "double",
-                                         "long double",
-                                         "complex float",
-                                         "complex double",
-                                         "complex long double",
-                                         "object",
-                                         "string",
-                                         "unicode",
-                                         "void",
-                                         "ntypes",
-                                         "notype",
-                                         "char",
-                                         "unknown"};
-    return typecode < 24 ? type_names[typecode] : type_names[24];
-  }
-
-  /* Make sure input has correct numpy type.  This now just calls
-     PyArray_EquivTypenums().
-   */
-  int type_match(int actual_type,
-                 int desired_type)
-  {
-    return PyArray_EquivTypenums(actual_type, desired_type);
-  }
+/* Given a NumPy typecode, return a string describing the type.
+ */
+const char *typecode_string(int typecode) {
+  static const char *type_names[25] = {"bool",
+                                       "byte",
+                                       "unsigned byte",
+                                       "short",
+                                       "unsigned short",
+                                       "int",
+                                       "unsigned int",
+                                       "long",
+                                       "unsigned long",
+                                       "long long",
+                                       "unsigned long long",
+                                       "float",
+                                       "double",
+                                       "long double",
+                                       "complex float",
+                                       "complex double",
+                                       "complex long double",
+                                       "object",
+                                       "string",
+                                       "unicode",
+                                       "void",
+                                       "ntypes",
+                                       "notype",
+                                       "char",
+                                       "unknown"};
+  return typecode < 24 ? type_names[typecode] : type_names[24];
+}
+
+/* Make sure input has correct numpy type.  This now just calls
+   PyArray_EquivTypenums().
+ */
+int type_match(int actual_type, int desired_type) {
+  return PyArray_EquivTypenums(actual_type, desired_type);
+}
 
 #ifdef SWIGPY_USE_CAPSULE
-  void free_cap(PyObject * cap)
-  {
-    void* array = (void*) PyCapsule_GetPointer(cap,SWIGPY_CAPSULE_NAME);
-    if (array != NULL) free(array);
-  }
+void free_cap(PyObject *cap) {
+  void *array = (void *)PyCapsule_GetPointer(cap, SWIGPY_CAPSULE_NAME);
+  if (array != NULL)
+    free(array);
+}
 #endif
 
-
-
-
-  /* Given a PyObject pointer, cast it to a PyArrayObject pointer if
-   * legal.  If not, set the python error string appropriately and
-   * return NULL.
-   */
-  PyArrayObject* obj_to_array_no_conversion(PyObject* input,
-                                            int        typecode)
-  {
-    PyArrayObject* ary = NULL;
-    if (is_array(input) && (typecode == NPY_NOTYPE ||
-                            PyArray_EquivTypenums(array_type(input), typecode)))
-    {
-      ary = (PyArrayObject*) input;
-    }
-    else if is_array(input)
-    {
-      const char* desired_type = typecode_string(typecode);
-      const char* actual_type  = typecode_string(array_type(input));
+/* Given a PyObject pointer, cast it to a PyArrayObject pointer if
+ * legal.  If not, set the python error string appropriately and
+ * return NULL.
+ */
+PyArrayObject *obj_to_array_no_conversion(PyObject *input, int typecode) {
+  PyArrayObject *ary = NULL;
+  if (is_array(input) && (typecode == NPY_NOTYPE ||
+                          PyArray_EquivTypenums(array_type(input), typecode))) {
+    ary = (PyArrayObject *)input;
+  } else if
+    is_array(input) {
+      const char *desired_type = typecode_string(typecode);
+      const char *actual_type = typecode_string(array_type(input));
       PyErr_Format(PyExc_TypeError,
                    "Array of type '%s' required.  Array of type '%s' given",
                    desired_type, actual_type);
       ary = NULL;
     }
-    else
-    {
-      const char* desired_type = typecode_string(typecode);
-      const char* actual_type  = pytype_string(input);
-      PyErr_Format(PyExc_TypeError,
-                   "Array of type '%s' required.  A '%s' was given",
-                   desired_type,
-                   actual_type);
-      ary = NULL;
-    }
-    return ary;
+  else {
+    const char *desired_type = typecode_string(typecode);
+    const char *actual_type = pytype_string(input);
+    PyErr_Format(PyExc_TypeError,
+                 "Array of type '%s' required.  A '%s' was given", desired_type,
+                 actual_type);
+    ary = NULL;
   }
+  return ary;
+}
 
-  /* Convert the given PyObject to a NumPy array with the given
-   * typecode.  On success, return a valid PyArrayObject* with the
-   * correct type.  On failure, the python error string will be set and
-   * the routine returns NULL.
-   */
-  PyArrayObject* obj_to_array_allow_conversion(PyObject* input,
-                                               int       typecode,
-                                               int*      is_new_object)
-  {
-    PyArrayObject* ary = NULL;
-    PyObject*      py_obj;
-    if (is_array(input) && (typecode == NPY_NOTYPE ||
-                            PyArray_EquivTypenums(array_type(input),typecode)))
-    {
-      ary = (PyArrayObject*) input;
-      *is_new_object = 0;
-    }
-    else
-    {
-      py_obj = PyArray_FROMANY(input, typecode, 0, 0, NPY_ARRAY_DEFAULT);
-      /* If NULL, PyArray_FromObject will have set python error value.*/
-      ary = (PyArrayObject*) py_obj;
-      *is_new_object = 1;
-    }
-    return ary;
-  }
-
-  /* Given a PyArrayObject, check to see if it is contiguous.  If so,
-   * return the input pointer and flag it as not a new object.  If it is
-   * not contiguous, create a new PyArrayObject using the original data,
-   * flag it as a new object and return the pointer.
-   */
-  PyArrayObject* make_contiguous(PyArrayObject* ary,
-                                 int*           is_new_object,
-                                 int            min_dims,
-                                 int            max_dims)
-  {
-    PyArrayObject* result;
-    if (array_is_contiguous(ary))
-    {
-      result = ary;
-      *is_new_object = 0;
-    }
-    else
-    {
-      result = (PyArrayObject*) PyArray_ContiguousFromObject((PyObject*)ary,
-                                                              array_type(ary),
-                                                              min_dims,
-                                                              max_dims);
-      *is_new_object = 1;
-    }
-    return result;
+/* Convert the given PyObject to a NumPy array with the given
+ * typecode.  On success, return a valid PyArrayObject* with the
+ * correct type.  On failure, the python error string will be set and
+ * the routine returns NULL.
+ */
+PyArrayObject *obj_to_array_allow_conversion(PyObject *input, int typecode,
+                                             int *is_new_object) {
+  PyArrayObject *ary = NULL;
+  PyObject *py_obj;
+  if (is_array(input) && (typecode == NPY_NOTYPE ||
+                          PyArray_EquivTypenums(array_type(input), typecode))) {
+    ary = (PyArrayObject *)input;
+    *is_new_object = 0;
+  } else {
+    py_obj = PyArray_FROMANY(input, typecode, 0, 0, NPY_ARRAY_DEFAULT);
+    /* If NULL, PyArray_FROMANY will have set the python error value. */
+    ary = (PyArrayObject *)py_obj;
+    *is_new_object = 1;
   }
+  return ary;
+}
 
-  /* Given a PyArrayObject, check to see if it is Fortran-contiguous.
-   * If so, return the input pointer, but do not flag it as not a new
-   * object.  If it is not Fortran-contiguous, create a new
-   * PyArrayObject using the original data, flag it as a new object
-   * and return the pointer.
-   */
-  PyArrayObject* make_fortran(PyArrayObject* ary,
-                              int*           is_new_object)
-  {
-    PyArrayObject* result;
-    if (array_is_fortran(ary))
-    {
-      result = ary;
-      *is_new_object = 0;
-    }
-    else
-    {
-      Py_INCREF(array_descr(ary));
-      result = (PyArrayObject*) PyArray_FromArray(ary,
-                                                  array_descr(ary),
+/* Given a PyArrayObject, check to see if it is contiguous.  If so,
+ * return the input pointer and flag it as not a new object.  If it is
+ * not contiguous, create a new PyArrayObject using the original data,
+ * flag it as a new object and return the pointer.
+ */
+PyArrayObject *make_contiguous(PyArrayObject *ary, int *is_new_object,
+                               int min_dims, int max_dims) {
+  PyArrayObject *result;
+  if (array_is_contiguous(ary)) {
+    result = ary;
+    *is_new_object = 0;
+  } else {
+    result = (PyArrayObject *)PyArray_ContiguousFromObject(
+        (PyObject *)ary, array_type(ary), min_dims, max_dims);
+    *is_new_object = 1;
+  }
+  return result;
+}
+
+/* Given a PyArrayObject, check to see if it is Fortran-contiguous.
+ * If so, return the input pointer and flag it as not a new
+ * object.  If it is not Fortran-contiguous, create a new
+ * PyArrayObject using the original data, flag it as a new object
+ * and return the pointer.
+ */
+PyArrayObject *make_fortran(PyArrayObject *ary, int *is_new_object) {
+  PyArrayObject *result;
+  if (array_is_fortran(ary)) {
+    result = ary;
+    *is_new_object = 0;
+  } else {
+    Py_INCREF(array_descr(ary));
+    result = (PyArrayObject *)PyArray_FromArray(ary, array_descr(ary),
 #if NPY_API_VERSION < 0x00000007
-                                                  NPY_FORTRANORDER);
+                                                NPY_FORTRANORDER);
 #else
-                                                  NPY_ARRAY_F_CONTIGUOUS);
+                                                NPY_ARRAY_F_CONTIGUOUS);
 #endif
-      *is_new_object = 1;
-    }
-    return result;
+    *is_new_object = 1;
   }
+  return result;
+}
 
-  /* Convert a given PyObject to a contiguous PyArrayObject of the
-   * specified type.  If the input object is not a contiguous
-   * PyArrayObject, a new one will be created and the new object flag
-   * will be set.
-   */
-  PyArrayObject* obj_to_array_contiguous_allow_conversion(PyObject* input,
-                                                          int       typecode,
-                                                          int*      is_new_object)
-  {
-    int is_new1 = 0;
-    int is_new2 = 0;
-    PyArrayObject* ary2;
-    PyArrayObject* ary1 = obj_to_array_allow_conversion(input,
-                                                        typecode,
-                                                        &is_new1);
-    if (ary1)
-    {
-      ary2 = make_contiguous(ary1, &is_new2, 0, 0);
-      if ( is_new1 && is_new2)
-      {
-        Py_DECREF(ary1);
-      }
-      ary1 = ary2;
-    }
-    *is_new_object = is_new1 || is_new2;
-    return ary1;
-  }
-
-  /* Convert a given PyObject to a Fortran-ordered PyArrayObject of the
-   * specified type.  If the input object is not a Fortran-ordered
-   * PyArrayObject, a new one will be created and the new object flag
-   * will be set.
-   */
-  PyArrayObject* obj_to_array_fortran_allow_conversion(PyObject* input,
-                                                       int       typecode,
-                                                       int*      is_new_object)
-  {
-    int is_new1 = 0;
-    int is_new2 = 0;
-    PyArrayObject* ary2;
-    PyArrayObject* ary1 = obj_to_array_allow_conversion(input,
-                                                        typecode,
-                                                        &is_new1);
-    if (ary1)
-    {
-      ary2 = make_fortran(ary1, &is_new2);
-      if (is_new1 && is_new2)
-      {
-        Py_DECREF(ary1);
-      }
-      ary1 = ary2;
+/* Convert a given PyObject to a contiguous PyArrayObject of the
+ * specified type.  If the input object is not a contiguous
+ * PyArrayObject, a new one will be created and the new object flag
+ * will be set.
+ */
+PyArrayObject *obj_to_array_contiguous_allow_conversion(PyObject *input,
+                                                        int typecode,
+                                                        int *is_new_object) {
+  int is_new1 = 0;
+  int is_new2 = 0;
+  PyArrayObject *ary2;
+  PyArrayObject *ary1 =
+      obj_to_array_allow_conversion(input, typecode, &is_new1);
+  if (ary1) {
+    ary2 = make_contiguous(ary1, &is_new2, 0, 0);
+    if (is_new1 && is_new2) {
+      Py_DECREF(ary1);
     }
-    *is_new_object = is_new1 || is_new2;
-    return ary1;
+    ary1 = ary2;
   }
+  *is_new_object = is_new1 || is_new2;
+  return ary1;
+}
 
-
-  /* Test whether a python object is contiguous.  If array is
-   * contiguous, return 1.  Otherwise, set the python error string and
-   * return 0.
-   */
-  int require_contiguous(PyArrayObject* ary)
-  {
-    int contiguous = 1;
-    if (!array_is_contiguous(ary))
-    {
-      PyErr_SetString(PyExc_TypeError,
-                      "Array must be contiguous.  A non-contiguous array was given");
-      contiguous = 0;
+/* Convert a given PyObject to a Fortran-ordered PyArrayObject of the
+ * specified type.  If the input object is not a Fortran-ordered
+ * PyArrayObject, a new one will be created and the new object flag
+ * will be set.
+ */
+PyArrayObject *obj_to_array_fortran_allow_conversion(PyObject *input,
+                                                     int typecode,
+                                                     int *is_new_object) {
+  int is_new1 = 0;
+  int is_new2 = 0;
+  PyArrayObject *ary2;
+  PyArrayObject *ary1 =
+      obj_to_array_allow_conversion(input, typecode, &is_new1);
+  if (ary1) {
+    ary2 = make_fortran(ary1, &is_new2);
+    if (is_new1 && is_new2) {
+      Py_DECREF(ary1);
     }
-    return contiguous;
+    ary1 = ary2;
   }
+  *is_new_object = is_new1 || is_new2;
+  return ary1;
+}
 
-  /* Test whether a python object is (C_ or F_) contiguous.  If array is
-   * contiguous, return 1.  Otherwise, set the python error string and
-   * return 0.
-   */
-  int require_c_or_f_contiguous(PyArrayObject* ary)
-  {
-    int contiguous = 1;
-    if (!(array_is_contiguous(ary) || array_is_fortran(ary)))
-    {
-      PyErr_SetString(PyExc_TypeError,
-                      "Array must be contiguous (C_ or F_).  A non-contiguous array was given");
-      contiguous = 0;
-    }
-    return contiguous;
+/* Test whether a python object is contiguous.  If array is
+ * contiguous, return 1.  Otherwise, set the python error string and
+ * return 0.
+ */
+int require_contiguous(PyArrayObject *ary) {
+  int contiguous = 1;
+  if (!array_is_contiguous(ary)) {
+    PyErr_SetString(
+        PyExc_TypeError,
+        "Array must be contiguous.  A non-contiguous array was given");
+    contiguous = 0;
   }
+  return contiguous;
+}
 
-  /* Require that a numpy array is not byte-swapped.  If the array is
-   * not byte-swapped, return 1.  Otherwise, set the python error string
-   * and return 0.
-   */
-  int require_native(PyArrayObject* ary)
-  {
-    int native = 1;
-    if (!array_is_native(ary))
-    {
-      PyErr_SetString(PyExc_TypeError,
-                      "Array must have native byteorder.  "
-                      "A byte-swapped array was given");
-      native = 0;
-    }
-    return native;
+/* Test whether a python object is (C_ or F_) contiguous.  If array is
+ * contiguous, return 1.  Otherwise, set the python error string and
+ * return 0.
+ */
+int require_c_or_f_contiguous(PyArrayObject *ary) {
+  int contiguous = 1;
+  if (!(array_is_contiguous(ary) || array_is_fortran(ary))) {
+    PyErr_SetString(PyExc_TypeError, "Array must be contiguous (C_ or F_).  A "
+                                     "non-contiguous array was given");
+    contiguous = 0;
   }
+  return contiguous;
+}
 
-  /* Require the given PyArrayObject to have a specified number of
-   * dimensions.  If the array has the specified number of dimensions,
-   * return 1.  Otherwise, set the python error string and return 0.
-   */
-  int require_dimensions(PyArrayObject* ary,
-                         int            exact_dimensions)
-  {
-    int success = 1;
-    if (array_numdims(ary) != exact_dimensions)
-    {
-      PyErr_Format(PyExc_TypeError,
-                   "Array must have %d dimensions.  Given array has %d dimensions",
-                   exact_dimensions,
-                   array_numdims(ary));
-      success = 0;
-    }
-    return success;
+/* Require that a numpy array is not byte-swapped.  If the array is
+ * not byte-swapped, return 1.  Otherwise, set the python error string
+ * and return 0.
+ */
+int require_native(PyArrayObject *ary) {
+  int native = 1;
+  if (!array_is_native(ary)) {
+    PyErr_SetString(PyExc_TypeError, "Array must have native byteorder.  "
+                                     "A byte-swapped array was given");
+    native = 0;
   }
+  return native;
+}
 
-  /* Require the given PyArrayObject to have one of a list of specified
-   * number of dimensions.  If the array has one of the specified number
-   * of dimensions, return 1.  Otherwise, set the python error string
-   * and return 0.
-   */
-  int require_dimensions_n(PyArrayObject* ary,
-                           int*           exact_dimensions,
-                           int            n)
-  {
-    int success = 0;
-    int i;
-    char dims_str[255] = "";
-    char s[255];
-    for (i = 0; i < n && !success; i++)
-    {
-      if (array_numdims(ary) == exact_dimensions[i])
-      {
-        success = 1;
-      }
+/* Require the given PyArrayObject to have a specified number of
+ * dimensions.  If the array has the specified number of dimensions,
+ * return 1.  Otherwise, set the python error string and return 0.
+ */
+int require_dimensions(PyArrayObject *ary, int exact_dimensions) {
+  int success = 1;
+  if (array_numdims(ary) != exact_dimensions) {
+    PyErr_Format(
+        PyExc_TypeError,
+        "Array must have %d dimensions.  Given array has %d dimensions",
+        exact_dimensions, array_numdims(ary));
+    success = 0;
+  }
+  return success;
+}
+
+/* Require the given PyArrayObject to have one of a list of specified
+ * number of dimensions.  If the array has one of the specified number
+ * of dimensions, return 1.  Otherwise, set the python error string
+ * and return 0.
+ */
+int require_dimensions_n(PyArrayObject *ary, int *exact_dimensions, int n) {
+  int success = 0;
+  int i;
+  char dims_str[255] = "";
+  char s[255];
+  for (i = 0; i < n && !success; i++) {
+    if (array_numdims(ary) == exact_dimensions[i]) {
+      success = 1;
     }
-    if (!success)
-    {
-      for (i = 0; i < n-1; i++)
-      {
-        sprintf(s, "%d, ", exact_dimensions[i]);
-        strcat(dims_str,s);
-      }
-      sprintf(s, " or %d", exact_dimensions[n-1]);
-      strcat(dims_str,s);
-      PyErr_Format(PyExc_TypeError,
-                   "Array must have %s dimensions.  Given array has %d dimensions",
-                   dims_str,
-                   array_numdims(ary));
+  }
+  if (!success) {
+    for (i = 0; i < n - 1; i++) {
+      sprintf(s, "%d, ", exact_dimensions[i]);
+      strcat(dims_str, s);
     }
-    return success;
+    sprintf(s, " or %d", exact_dimensions[n - 1]);
+    strcat(dims_str, s);
+    PyErr_Format(
+        PyExc_TypeError,
+        "Array must have %s dimensions.  Given array has %d dimensions",
+        dims_str, array_numdims(ary));
   }
+  return success;
+}
 
-  /* Require the given PyArrayObject to have a specified shape.  If the
-   * array has the specified shape, return 1.  Otherwise, set the python
-   * error string and return 0.
-   */
-  int require_size(PyArrayObject* ary,
-                   npy_intp*      size,
-                   int            n)
-  {
-    int i;
-    int success = 1;
-    size_t len;
-    char desired_dims[255] = "[";
-    char s[255];
-    char actual_dims[255] = "[";
-    for(i=0; i < n;i++)
-    {
-      if (size[i] != -1 &&  size[i] != array_size(ary,i))
-      {
-        success = 0;
-      }
+/* Require the given PyArrayObject to have a specified shape.  If the
+ * array has the specified shape, return 1.  Otherwise, set the python
+ * error string and return 0.
+ */
+int require_size(PyArrayObject *ary, npy_intp *size, int n) {
+  int i;
+  int success = 1;
+  size_t len;
+  char desired_dims[255] = "[";
+  char s[255];
+  char actual_dims[255] = "[";
+  for (i = 0; i < n; i++) {
+    if (size[i] != -1 && size[i] != array_size(ary, i)) {
+      success = 0;
     }
-    if (!success)
-    {
-      for (i = 0; i < n; i++)
-      {
-        if (size[i] == -1)
-        {
-          sprintf(s, "*,");
-        }
-        else
-        {
-          sprintf(s, "%ld,", (long int)size[i]);
-        }
-        strcat(desired_dims,s);
-      }
-      len = strlen(desired_dims);
-      desired_dims[len-1] = ']';
-      for (i = 0; i < n; i++)
-      {
-        sprintf(s, "%ld,", (long int)array_size(ary,i));
-        strcat(actual_dims,s);
+  }
+  if (!success) {
+    for (i = 0; i < n; i++) {
+      if (size[i] == -1) {
+        sprintf(s, "*,");
+      } else {
+        sprintf(s, "%ld,", (long int)size[i]);
       }
-      len = strlen(actual_dims);
-      actual_dims[len-1] = ']';
-      PyErr_Format(PyExc_TypeError,
-                   "Array must have shape of %s.  Given array has shape of %s",
-                   desired_dims,
-                   actual_dims);
+      strcat(desired_dims, s);
     }
-    return success;
+    len = strlen(desired_dims);
+    desired_dims[len - 1] = ']';
+    for (i = 0; i < n; i++) {
+      sprintf(s, "%ld,", (long int)array_size(ary, i));
+      strcat(actual_dims, s);
+    }
+    len = strlen(actual_dims);
+    actual_dims[len - 1] = ']';
+    PyErr_Format(PyExc_TypeError,
+                 "Array must have shape of %s.  Given array has shape of %s",
+                 desired_dims, actual_dims);
   }
+  return success;
+}
 
-  /* Require the given PyArrayObject to to be Fortran ordered.  If the
-   * the PyArrayObject is already Fortran ordered, do nothing.  Else,
-   * set the Fortran ordering flag and recompute the strides.
-   */
-  int require_fortran(PyArrayObject* ary)
-  {
-    int success = 1;
-    int nd = array_numdims(ary);
-    int i;
-    npy_intp * strides = array_strides(ary);
-    if (array_is_fortran(ary)) return success;
-    int n_non_one = 0;
-    /* Set the Fortran ordered flag */
-    const npy_intp *dims = array_dimensions(ary);
-    for (i=0; i < nd; ++i)
-      n_non_one += (dims[i] != 1) ? 1 : 0;
-    if (n_non_one > 1)    
-      array_clearflags(ary,NPY_ARRAY_CARRAY);
-    array_enableflags(ary,NPY_ARRAY_FARRAY);
-    /* Recompute the strides */
-    strides[0] = strides[nd-1];
-    for (i=1; i < nd; ++i)
-      strides[i] = strides[i-1] * array_size(ary,i-1);
+/* Require the given PyArrayObject to be Fortran ordered.  If the
+ * PyArrayObject is already Fortran ordered, do nothing.  Else,
+ * set the Fortran ordering flag and recompute the strides.
+ */
+int require_fortran(PyArrayObject *ary) {
+  int success = 1;
+  int nd = array_numdims(ary);
+  int i;
+  npy_intp *strides = array_strides(ary);
+  if (array_is_fortran(ary))
     return success;
-  }
-
-
-
+  int n_non_one = 0;
+  /* Set the Fortran ordered flag */
+  const npy_intp *dims = array_dimensions(ary);
+  for (i = 0; i < nd; ++i)
+    n_non_one += (dims[i] != 1) ? 1 : 0;
+  if (n_non_one > 1)
+    array_clearflags(ary, NPY_ARRAY_CARRAY);
+  array_enableflags(ary, NPY_ARRAY_FARRAY);
+  /* Recompute the strides */
+  strides[0] = strides[nd - 1];
+  for (i = 1; i < nd; ++i)
+    strides[i] = strides[i - 1] * array_size(ary, i - 1);
+  return success;
+}
 
 #include <limits.h>
 #if !defined(SWIG_NO_LLONG_MAX)
-# if !defined(LLONG_MAX) && defined(__GNUC__) && defined (__LONG_LONG_MAX__)
-#   define LLONG_MAX __LONG_LONG_MAX__
-#   define LLONG_MIN (-LLONG_MAX - 1LL)
-#   define ULLONG_MAX (LLONG_MAX * 2ULL + 1ULL)
-# endif
+#if !defined(LLONG_MAX) && defined(__GNUC__) && defined(__LONG_LONG_MAX__)
+#define LLONG_MAX __LONG_LONG_MAX__
+#define LLONG_MIN (-LLONG_MAX - 1LL)
+#define ULLONG_MAX (LLONG_MAX * 2ULL + 1ULL)
+#endif
 #endif
 
-
-SWIGINTERN int
-SWIG_AsVal_double (PyObject *obj, double *val)
-{
+SWIGINTERN int SWIG_AsVal_double(PyObject *obj, double *val) {
   int res = SWIG_TypeError;
   if (PyFloat_Check(obj)) {
-    if (val) *val = PyFloat_AsDouble(obj);
+    if (val)
+      *val = PyFloat_AsDouble(obj);
     return SWIG_OK;
 #if PY_VERSION_HEX < 0x03000000
   } else if (PyInt_Check(obj)) {
-    if (val) *val = PyInt_AsLong(obj);
+    if (val)
+      *val = PyInt_AsLong(obj);
     return SWIG_OK;
 #endif
   } else if (PyLong_Check(obj)) {
     double v = PyLong_AsDouble(obj);
     if (!PyErr_Occurred()) {
-      if (val) *val = v;
+      if (val)
+        *val = v;
       return SWIG_OK;
     } else {
       PyErr_Clear();
@@ -3552,7 +3573,8 @@ SWIG_AsVal_double (PyObject *obj, double *val)
     int dispatch = 0;
     double d = PyFloat_AsDouble(obj);
     if (!PyErr_Occurred()) {
-      if (val) *val = d;
+      if (val)
+        *val = d;
       return SWIG_AddCast(SWIG_OK);
     } else {
       PyErr_Clear();
@@ -3560,10 +3582,11 @@ SWIG_AsVal_double (PyObject *obj, double *val)
     if (!dispatch) {
       long v = PyLong_AsLong(obj);
       if (!PyErr_Occurred()) {
-	if (val) *val = v;
-	return SWIG_AddCast(SWIG_AddCast(SWIG_OK));
+        if (val)
+          *val = v;
+        return SWIG_AddCast(SWIG_AddCast(SWIG_OK));
       } else {
-	PyErr_Clear();
+        PyErr_Clear();
       }
     }
   }
@@ -3571,56 +3594,51 @@ SWIG_AsVal_double (PyObject *obj, double *val)
   return res;
 }
 
-
 #include <float.h>
 
-
 #include <math.h>
 
-
-SWIGINTERNINLINE int
-SWIG_CanCastAsInteger(double *d, double min, double max) {
+SWIGINTERNINLINE int SWIG_CanCastAsInteger(double *d, double min, double max) {
   double x = *d;
   if ((min <= x && x <= max)) {
-   double fx = floor(x);
-   double cx = ceil(x);
-   double rd =  ((x - fx) < 0.5) ? fx : cx; /* simple rint */
-   if ((errno == EDOM) || (errno == ERANGE)) {
-     errno = 0;
-   } else {
-     double summ, reps, diff;
-     if (rd < x) {
-       diff = x - rd;
-     } else if (rd > x) {
-       diff = rd - x;
-     } else {
-       return 1;
-     }
-     summ = rd + x;
-     reps = diff/summ;
-     if (reps < 8*DBL_EPSILON) {
-       *d = rd;
-       return 1;
-     }
-   }
+    double fx = floor(x);
+    double cx = ceil(x);
+    double rd = ((x - fx) < 0.5) ? fx : cx; /* simple rint */
+    if ((errno == EDOM) || (errno == ERANGE)) {
+      errno = 0;
+    } else {
+      double summ, reps, diff;
+      if (rd < x) {
+        diff = x - rd;
+      } else if (rd > x) {
+        diff = rd - x;
+      } else {
+        return 1;
+      }
+      summ = rd + x;
+      reps = diff / summ;
+      if (reps < 8 * DBL_EPSILON) {
+        *d = rd;
+        return 1;
+      }
+    }
   }
   return 0;
 }
 
-
-SWIGINTERN int
-SWIG_AsVal_long (PyObject *obj, long* val)
-{
+SWIGINTERN int SWIG_AsVal_long(PyObject *obj, long *val) {
 #if PY_VERSION_HEX < 0x03000000
   if (PyInt_Check(obj)) {
-    if (val) *val = PyInt_AsLong(obj);
+    if (val)
+      *val = PyInt_AsLong(obj);
     return SWIG_OK;
   } else
 #endif
-  if (PyLong_Check(obj)) {
+      if (PyLong_Check(obj)) {
     long v = PyLong_AsLong(obj);
     if (!PyErr_Occurred()) {
-      if (val) *val = v;
+      if (val)
+        *val = v;
       return SWIG_OK;
     } else {
       PyErr_Clear();
@@ -3632,17 +3650,19 @@ SWIG_AsVal_long (PyObject *obj, long* val)
     int dispatch = 0;
     long v = PyInt_AsLong(obj);
     if (!PyErr_Occurred()) {
-      if (val) *val = v;
+      if (val)
+        *val = v;
       return SWIG_AddCast(SWIG_OK);
     } else {
       PyErr_Clear();
     }
     if (!dispatch) {
       double d;
-      int res = SWIG_AddCast(SWIG_AsVal_double (obj,&d));
+      int res = SWIG_AddCast(SWIG_AsVal_double(obj, &d));
       if (SWIG_IsOK(res) && SWIG_CanCastAsInteger(&d, LONG_MIN, LONG_MAX)) {
-	if (val) *val = (long)(d);
-	return res;
+        if (val)
+          *val = (long)(d);
+        return res;
       }
     }
   }
@@ -3650,291 +3670,334 @@ SWIG_AsVal_long (PyObject *obj, long* val)
   return SWIG_TypeError;
 }
 
-
-SWIGINTERN int
-SWIG_AsVal_int (PyObject * obj, int *val)
-{
+SWIGINTERN int SWIG_AsVal_int(PyObject *obj, int *val) {
   long v;
-  int res = SWIG_AsVal_long (obj, &v);
+  int res = SWIG_AsVal_long(obj, &v);
   if (SWIG_IsOK(res)) {
     if ((v < INT_MIN || v > INT_MAX)) {
       return SWIG_OverflowError;
     } else {
-      if (val) *val = (int)(v);
+      if (val)
+        *val = (int)(v);
     }
-  }  
+  }
   return res;
 }
 
 #ifdef __cplusplus
 extern "C" {
 #endif
-SWIGINTERN PyObject *_wrap_gamut_map__SWIG_0(PyObject *SWIGUNUSEDPARM(self), PyObject *args) {
+SWIGINTERN PyObject *_wrap_gamut_map__SWIG_0(PyObject *SWIGUNUSEDPARM(self),
+                                             PyObject *args) {
   PyObject *resultobj = 0;
-  float *arg1 = (float *) 0 ;
-  int arg2 ;
-  int arg3 ;
-  int arg4 ;
-  float *arg5 = (float *) 0 ;
-  int arg6 ;
-  int arg7 ;
-  int arg8 ;
-  float *arg9 = (float *) 0 ;
-  int arg10 ;
-  int arg11 ;
-  float *arg12 = (float *) 0 ;
-  int arg13 ;
-  int arg14 ;
-  float *arg15 = (float *) 0 ;
-  int arg16 ;
-  int arg17 ;
-  PyArrayObject *array1 = NULL ;
-  int is_new_object1 = 0 ;
-  PyArrayObject *array5 = NULL ;
-  PyArrayObject *array9 = NULL ;
-  int is_new_object9 = 0 ;
-  PyArrayObject *array12 = NULL ;
-  int is_new_object12 = 0 ;
-  PyArrayObject *array15 = NULL ;
-  int is_new_object15 = 0 ;
-  PyObject * obj0 = 0 ;
-  PyObject * obj1 = 0 ;
-  PyObject * obj2 = 0 ;
-  PyObject * obj3 = 0 ;
-  PyObject * obj4 = 0 ;
-  
-  if (!PyArg_ParseTuple(args,(char *)"OOOOO:gamut_map",&obj0,&obj1,&obj2,&obj3,&obj4)) SWIG_fail;
+  float *arg1 = (float *)0;
+  int arg2;
+  int arg3;
+  int arg4;
+  float *arg5 = (float *)0;
+  int arg6;
+  int arg7;
+  int arg8;
+  float *arg9 = (float *)0;
+  int arg10;
+  int arg11;
+  float *arg12 = (float *)0;
+  int arg13;
+  int arg14;
+  float *arg15 = (float *)0;
+  int arg16;
+  int arg17;
+  PyArrayObject *array1 = NULL;
+  int is_new_object1 = 0;
+  PyArrayObject *array5 = NULL;
+  PyArrayObject *array9 = NULL;
+  int is_new_object9 = 0;
+  PyArrayObject *array12 = NULL;
+  int is_new_object12 = 0;
+  PyArrayObject *array15 = NULL;
+  int is_new_object15 = 0;
+  PyObject *obj0 = 0;
+  PyObject *obj1 = 0;
+  PyObject *obj2 = 0;
+  PyObject *obj3 = 0;
+  PyObject *obj4 = 0;
+
+  if (!PyArg_ParseTuple(args, (char *)"OOOOO:gamut_map", &obj0, &obj1, &obj2,
+                        &obj3, &obj4))
+    SWIG_fail;
   {
-    npy_intp size[3] = {
-      -1, -1, -1 
-    };
+    npy_intp size[3] = {-1, -1, -1};
     array1 = obj_to_array_contiguous_allow_conversion(obj0, NPY_FLOAT,
-      &is_new_object1);
+                                                      &is_new_object1);
     if (!array1 || !require_dimensions(array1, 3) ||
-      !require_size(array1, size, 3)) SWIG_fail;
-    arg1 = (float*) array_data(array1);
-    arg2 = (int) array_size(array1,0);
-    arg3 = (int) array_size(array1,1);
-    arg4 = (int) array_size(array1,2);
+        !require_size(array1, size, 3))
+      SWIG_fail;
+    arg1 = (float *)array_data(array1);
+    arg2 = (int)array_size(array1, 0);
+    arg3 = (int)array_size(array1, 1);
+    arg4 = (int)array_size(array1, 2);
   }
   {
     array5 = obj_to_array_no_conversion(obj1, NPY_FLOAT);
-    if (!array5 || !require_dimensions(array5,3) || !require_contiguous(array5) ||
-      !require_native(array5)) SWIG_fail;
-    arg5 = (float*) array_data(array5);
-    arg6 = (int) array_size(array5,0);
-    arg7 = (int) array_size(array5,1);
-    arg8 = (int) array_size(array5,2);
+    if (!array5 || !require_dimensions(array5, 3) ||
+        !require_contiguous(array5) || !require_native(array5))
+      SWIG_fail;
+    arg5 = (float *)array_data(array5);
+    arg6 = (int)array_size(array5, 0);
+    arg7 = (int)array_size(array5, 1);
+    arg8 = (int)array_size(array5, 2);
   }
   {
-    npy_intp size[2] = {
-      -1, -1 
-    };
+    npy_intp size[2] = {-1, -1};
     array9 = obj_to_array_contiguous_allow_conversion(obj2, NPY_FLOAT,
-      &is_new_object9);
+                                                      &is_new_object9);
     if (!array9 || !require_dimensions(array9, 2) ||
-      !require_size(array9, size, 2)) SWIG_fail;
-    arg9 = (float*) array_data(array9);
-    arg10 = (int) array_size(array9,0);
-    arg11 = (int) array_size(array9,1);
+        !require_size(array9, size, 2))
+      SWIG_fail;
+    arg9 = (float *)array_data(array9);
+    arg10 = (int)array_size(array9, 0);
+    arg11 = (int)array_size(array9, 1);
   }
   {
-    npy_intp size[2] = {
-      -1, -1 
-    };
+    npy_intp size[2] = {-1, -1};
     array12 = obj_to_array_contiguous_allow_conversion(obj3, NPY_FLOAT,
-      &is_new_object12);
+                                                       &is_new_object12);
     if (!array12 || !require_dimensions(array12, 2) ||
-      !require_size(array12, size, 2)) SWIG_fail;
-    arg12 = (float*) array_data(array12);
-    arg13 = (int) array_size(array12,0);
-    arg14 = (int) array_size(array12,1);
+        !require_size(array12, size, 2))
+      SWIG_fail;
+    arg12 = (float *)array_data(array12);
+    arg13 = (int)array_size(array12, 0);
+    arg14 = (int)array_size(array12, 1);
   }
   {
-    npy_intp size[2] = {
-      -1, -1 
-    };
+    npy_intp size[2] = {-1, -1};
     array15 = obj_to_array_contiguous_allow_conversion(obj4, NPY_FLOAT,
-      &is_new_object15);
+                                                       &is_new_object15);
     if (!array15 || !require_dimensions(array15, 2) ||
-      !require_size(array15, size, 2)) SWIG_fail;
-    arg15 = (float*) array_data(array15);
-    arg16 = (int) array_size(array15,0);
-    arg17 = (int) array_size(array15,1);
+        !require_size(array15, size, 2))
+      SWIG_fail;
+    arg15 = (float *)array_data(array15);
+    arg16 = (int)array_size(array15, 0);
+    arg17 = (int)array_size(array15, 1);
   }
   {
-    gamut_map_full(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12,arg13,arg14,arg15,arg16,arg17);
-    if (PyErr_Occurred()) SWIG_fail;
+    gamut_map_full(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10,
+                   arg11, arg12, arg13, arg14, arg15, arg16, arg17);
+    if (PyErr_Occurred())
+      SWIG_fail;
   }
   resultobj = SWIG_Py_Void();
   {
-    if (is_new_object1 && array1)
-    {
-      Py_DECREF(array1); 
+    if (is_new_object1 && array1) {
+      Py_DECREF(array1);
     }
   }
   {
-    if (is_new_object9 && array9)
-    {
-      Py_DECREF(array9); 
+    if (is_new_object9 && array9) {
+      Py_DECREF(array9);
     }
   }
   {
-    if (is_new_object12 && array12)
-    {
-      Py_DECREF(array12); 
+    if (is_new_object12 && array12) {
+      Py_DECREF(array12);
     }
   }
   {
-    if (is_new_object15 && array15)
-    {
-      Py_DECREF(array15); 
+    if (is_new_object15 && array15) {
+      Py_DECREF(array15);
     }
   }
   return resultobj;
-fail:
-  {
-    if (is_new_object1 && array1)
-    {
-      Py_DECREF(array1); 
-    }
+fail : {
+  if (is_new_object1 && array1) {
+    Py_DECREF(array1);
   }
+}
   {
-    if (is_new_object9 && array9)
-    {
-      Py_DECREF(array9); 
+    if (is_new_object9 && array9) {
+      Py_DECREF(array9);
     }
   }
   {
-    if (is_new_object12 && array12)
-    {
-      Py_DECREF(array12); 
+    if (is_new_object12 && array12) {
+      Py_DECREF(array12);
     }
   }
   {
-    if (is_new_object15 && array15)
-    {
-      Py_DECREF(array15); 
+    if (is_new_object15 && array15) {
+      Py_DECREF(array15);
     }
   }
   return NULL;
 }
 
-
-SWIGINTERN PyObject *_wrap_gamut_map__SWIG_1(PyObject *SWIGUNUSEDPARM(self), PyObject *args) {
+SWIGINTERN PyObject *_wrap_gamut_map__SWIG_1(PyObject *SWIGUNUSEDPARM(self),
+                                             PyObject *args) {
   PyObject *resultobj = 0;
-  float *arg1 = (float *) 0 ;
-  int arg2 ;
-  int arg3 ;
-  int arg4 ;
-  float *arg5 = (float *) 0 ;
-  float *arg6 = (float *) 0 ;
-  float *arg7 = (float *) 0 ;
-  float *arg8 = (float *) 0 ;
-  int arg9 ;
-  void *argp1 = 0 ;
-  int res1 = 0 ;
-  int val2 ;
-  int ecode2 = 0 ;
-  int val3 ;
-  int ecode3 = 0 ;
-  int val4 ;
-  int ecode4 = 0 ;
-  void *argp5 = 0 ;
-  int res5 = 0 ;
-  void *argp6 = 0 ;
-  int res6 = 0 ;
-  void *argp7 = 0 ;
-  int res7 = 0 ;
-  void *argp8 = 0 ;
-  int res8 = 0 ;
-  int val9 ;
-  int ecode9 = 0 ;
-  PyObject * obj0 = 0 ;
-  PyObject * obj1 = 0 ;
-  PyObject * obj2 = 0 ;
-  PyObject * obj3 = 0 ;
-  PyObject * obj4 = 0 ;
-  PyObject * obj5 = 0 ;
-  PyObject * obj6 = 0 ;
-  PyObject * obj7 = 0 ;
-  PyObject * obj8 = 0 ;
-  
-  if (!PyArg_ParseTuple(args,(char *)"OOOOOOOOO:gamut_map",&obj0,&obj1,&obj2,&obj3,&obj4,&obj5,&obj6,&obj7,&obj8)) SWIG_fail;
-  res1 = SWIG_ConvertPtr(obj0, &argp1,SWIGTYPE_p_float, 0 |  0 );
+  float *arg1 = (float *)0;
+  int arg2;
+  int arg3;
+  int arg4;
+  float *arg5 = (float *)0;
+  float *arg6 = (float *)0;
+  float *arg7 = (float *)0;
+  float *arg8 = (float *)0;
+  int arg9;
+  void *argp1 = 0;
+  int res1 = 0;
+  int val2;
+  int ecode2 = 0;
+  int val3;
+  int ecode3 = 0;
+  int val4;
+  int ecode4 = 0;
+  void *argp5 = 0;
+  int res5 = 0;
+  void *argp6 = 0;
+  int res6 = 0;
+  void *argp7 = 0;
+  int res7 = 0;
+  void *argp8 = 0;
+  int res8 = 0;
+  int val9;
+  int ecode9 = 0;
+  PyObject *obj0 = 0;
+  PyObject *obj1 = 0;
+  PyObject *obj2 = 0;
+  PyObject *obj3 = 0;
+  PyObject *obj4 = 0;
+  PyObject *obj5 = 0;
+  PyObject *obj6 = 0;
+  PyObject *obj7 = 0;
+  PyObject *obj8 = 0;
+
+  if (!PyArg_ParseTuple(args, (char *)"OOOOOOOOO:gamut_map", &obj0, &obj1,
+                        &obj2, &obj3, &obj4, &obj5, &obj6, &obj7, &obj8))
+    SWIG_fail;
+  res1 = SWIG_ConvertPtr(obj0, &argp1, SWIGTYPE_p_float, 0 | 0);
   if (!SWIG_IsOK(res1)) {
-    SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "gamut_map" "', argument " "1"" of type '" "float *""'"); 
+    SWIG_exception_fail(SWIG_ArgError(res1), "in method '"
+                                             "gamut_map"
+                                             "', argument "
+                                             "1"
+                                             " of type '"
+                                             "float *"
+                                             "'");
   }
   arg1 = (float *)(argp1);
   ecode2 = SWIG_AsVal_int(obj1, &val2);
   if (!SWIG_IsOK(ecode2)) {
-    SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "gamut_map" "', argument " "2"" of type '" "int""'");
-  } 
+    SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '"
+                                               "gamut_map"
+                                               "', argument "
+                                               "2"
+                                               " of type '"
+                                               "int"
+                                               "'");
+  }
   arg2 = (int)(val2);
   ecode3 = SWIG_AsVal_int(obj2, &val3);
   if (!SWIG_IsOK(ecode3)) {
-    SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '" "gamut_map" "', argument " "3"" of type '" "int""'");
-  } 
+    SWIG_exception_fail(SWIG_ArgError(ecode3), "in method '"
+                                               "gamut_map"
+                                               "', argument "
+                                               "3"
+                                               " of type '"
+                                               "int"
+                                               "'");
+  }
   arg3 = (int)(val3);
   ecode4 = SWIG_AsVal_int(obj3, &val4);
   if (!SWIG_IsOK(ecode4)) {
-    SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '" "gamut_map" "', argument " "4"" of type '" "int""'");
-  } 
+    SWIG_exception_fail(SWIG_ArgError(ecode4), "in method '"
+                                               "gamut_map"
+                                               "', argument "
+                                               "4"
+                                               " of type '"
+                                               "int"
+                                               "'");
+  }
   arg4 = (int)(val4);
-  res5 = SWIG_ConvertPtr(obj4, &argp5,SWIGTYPE_p_float, 0 |  0 );
+  res5 = SWIG_ConvertPtr(obj4, &argp5, SWIGTYPE_p_float, 0 | 0);
   if (!SWIG_IsOK(res5)) {
-    SWIG_exception_fail(SWIG_ArgError(res5), "in method '" "gamut_map" "', argument " "5"" of type '" "float *""'"); 
+    SWIG_exception_fail(SWIG_ArgError(res5), "in method '"
+                                             "gamut_map"
+                                             "', argument "
+                                             "5"
+                                             " of type '"
+                                             "float *"
+                                             "'");
   }
   arg5 = (float *)(argp5);
-  res6 = SWIG_ConvertPtr(obj5, &argp6,SWIGTYPE_p_float, 0 |  0 );
+  res6 = SWIG_ConvertPtr(obj5, &argp6, SWIGTYPE_p_float, 0 | 0);
   if (!SWIG_IsOK(res6)) {
-    SWIG_exception_fail(SWIG_ArgError(res6), "in method '" "gamut_map" "', argument " "6"" of type '" "float *""'"); 
+    SWIG_exception_fail(SWIG_ArgError(res6), "in method '"
+                                             "gamut_map"
+                                             "', argument "
+                                             "6"
+                                             " of type '"
+                                             "float *"
+                                             "'");
   }
   arg6 = (float *)(argp6);
-  res7 = SWIG_ConvertPtr(obj6, &argp7,SWIGTYPE_p_float, 0 |  0 );
+  res7 = SWIG_ConvertPtr(obj6, &argp7, SWIGTYPE_p_float, 0 | 0);
   if (!SWIG_IsOK(res7)) {
-    SWIG_exception_fail(SWIG_ArgError(res7), "in method '" "gamut_map" "', argument " "7"" of type '" "float *""'"); 
+    SWIG_exception_fail(SWIG_ArgError(res7), "in method '"
+                                             "gamut_map"
+                                             "', argument "
+                                             "7"
+                                             " of type '"
+                                             "float *"
+                                             "'");
   }
   arg7 = (float *)(argp7);
-  res8 = SWIG_ConvertPtr(obj7, &argp8,SWIGTYPE_p_float, 0 |  0 );
+  res8 = SWIG_ConvertPtr(obj7, &argp8, SWIGTYPE_p_float, 0 | 0);
   if (!SWIG_IsOK(res8)) {
-    SWIG_exception_fail(SWIG_ArgError(res8), "in method '" "gamut_map" "', argument " "8"" of type '" "float *""'"); 
+    SWIG_exception_fail(SWIG_ArgError(res8), "in method '"
+                                             "gamut_map"
+                                             "', argument "
+                                             "8"
+                                             " of type '"
+                                             "float *"
+                                             "'");
   }
   arg8 = (float *)(argp8);
   ecode9 = SWIG_AsVal_int(obj8, &val9);
   if (!SWIG_IsOK(ecode9)) {
-    SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '" "gamut_map" "', argument " "9"" of type '" "int""'");
-  } 
+    SWIG_exception_fail(SWIG_ArgError(ecode9), "in method '"
+                                               "gamut_map"
+                                               "', argument "
+                                               "9"
+                                               " of type '"
+                                               "int"
+                                               "'");
+  }
   arg9 = (int)(val9);
-  gamut_map(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9);
+  gamut_map(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9);
   resultobj = SWIG_Py_Void();
   return resultobj;
 fail:
   return NULL;
 }
 
-
 SWIGINTERN PyObject *_wrap_gamut_map(PyObject *self, PyObject *args) {
   Py_ssize_t argc;
-  PyObject *argv[10] = {
-    0
-  };
+  PyObject *argv[10] = {0};
   Py_ssize_t ii;
-  
-  if (!PyTuple_Check(args)) SWIG_fail;
+
+  if (!PyTuple_Check(args))
+    SWIG_fail;
   argc = args ? PyObject_Length(args) : 0;
   for (ii = 0; (ii < 9) && (ii < argc); ii++) {
-    argv[ii] = PyTuple_GET_ITEM(args,ii);
+    argv[ii] = PyTuple_GET_ITEM(args, ii);
   }
   if (argc == 5) {
     int _v;
-    {
-      _v = is_array(argv[0]) || PySequence_Check(argv[0]);
-    }
+    { _v = is_array(argv[0]) || PySequence_Check(argv[0]); }
     if (_v) {
       {
-        _v = is_array(argv[1]) && PyArray_EquivTypenums(array_type(argv[1]),
-          NPY_FLOAT);
+        _v = is_array(argv[1]) &&
+             PyArray_EquivTypenums(array_type(argv[1]), NPY_FLOAT);
       }
       if (_v) {
         {
@@ -3996,7 +4059,8 @@ SWIGINTERN PyObject *_wrap_gamut_map(PyObject *self, PyObject *args) {
                 _v = SWIG_CheckState(res);
                 if (_v) {
                   void *vptr = 0;
-                  int res = SWIG_ConvertPtr(argv[7], &vptr, SWIGTYPE_p_float, 0);
+                  int res =
+                      SWIG_ConvertPtr(argv[7], &vptr, SWIGTYPE_p_float, 0);
                   _v = SWIG_CheckState(res);
                   if (_v) {
                     {
@@ -4015,46 +4079,50 @@ SWIGINTERN PyObject *_wrap_gamut_map(PyObject *self, PyObject *args) {
       }
     }
   }
-  
+
 fail:
-  SWIG_SetErrorMsg(PyExc_NotImplementedError,"Wrong number or type of arguments for overloaded function 'gamut_map'.\n"
-    "  Possible C/C++ prototypes are:\n"
-    "    gamut_map_full(float *,int,int,int,float *,int,int,int,float *,int,int,float *,int,int,float *,int,int)\n"
-    "    gamut_map(float *,int,int,int,float *,float *,float *,float *,int)\n");
+  SWIG_SetErrorMsg(
+      PyExc_NotImplementedError,
+      "Wrong number or type of arguments for overloaded function 'gamut_map'.\n"
+      "  Possible C/C++ prototypes are:\n"
+      "    gamut_map_full(float *,int,int,int,float *,int,int,int,float "
+      "*,int,int,float *,int,int,float *,int,int)\n"
+      "    gamut_map(float *,int,int,int,float *,float *,float *,float "
+      "*,int)\n");
   return 0;
 }
 
-
 static PyMethodDef SwigMethods[] = {
-	 { (char *)"SWIG_PyInstanceMethod_New", (PyCFunction)SWIG_PyInstanceMethod_New, METH_O, NULL},
-	 { (char *)"gamut_map", _wrap_gamut_map, METH_VARARGS, NULL},
-	 { NULL, NULL, 0, NULL }
-};
-
+    {(char *)"SWIG_PyInstanceMethod_New",
+     (PyCFunction)SWIG_PyInstanceMethod_New, METH_O, NULL},
+    {(char *)"gamut_map", _wrap_gamut_map, METH_VARARGS, NULL},
+    {NULL, NULL, 0, NULL}};
 
 /* -------- TYPE CONVERSION AND EQUIVALENCE RULES (BEGIN) -------- */
 
-static swig_type_info _swigt__p_char = {"_p_char", "char *", 0, 0, (void*)0, 0};
-static swig_type_info _swigt__p_float = {"_p_float", "float *", 0, 0, (void*)0, 0};
+static swig_type_info _swigt__p_char = {"_p_char", "char *",  0,
+                                        0,         (void *)0, 0};
+static swig_type_info _swigt__p_float = {"_p_float", "float *", 0,
+                                         0,          (void *)0, 0};
 
 static swig_type_info *swig_type_initial[] = {
-  &_swigt__p_char,
-  &_swigt__p_float,
+    &_swigt__p_char,
+    &_swigt__p_float,
 };
 
-static swig_cast_info _swigc__p_char[] = {  {&_swigt__p_char, 0, 0, 0},{0, 0, 0, 0}};
-static swig_cast_info _swigc__p_float[] = {  {&_swigt__p_float, 0, 0, 0},{0, 0, 0, 0}};
+static swig_cast_info _swigc__p_char[] = {{&_swigt__p_char, 0, 0, 0},
+                                          {0, 0, 0, 0}};
+static swig_cast_info _swigc__p_float[] = {{&_swigt__p_float, 0, 0, 0},
+                                           {0, 0, 0, 0}};
 
 static swig_cast_info *swig_cast_initial[] = {
-  _swigc__p_char,
-  _swigc__p_float,
+    _swigc__p_char,
+    _swigc__p_float,
 };
 
-
 /* -------- TYPE CONVERSION AND EQUIVALENCE RULES (END) -------- */
 
-static swig_const_info swig_const_table[] = {
-{0, 0, 0, 0.0, 0, 0}};
+static swig_const_info swig_const_table[] = {{0, 0, 0, 0.0, 0, 0}};
 
 #ifdef __cplusplus
 }
@@ -4098,7 +4166,8 @@ static swig_const_info swig_const_table[] = {
  *  3) Finally, if cast->type has not already been loaded, then we add that
  *     swig_cast_info to the linked list (because the cast->type) pointer will
  *     be correct.
- * ----------------------------------------------------------------------------- */
+ * -----------------------------------------------------------------------------
+ */
 
 #ifdef __cplusplus
 extern "C" {
@@ -4111,15 +4180,13 @@ extern "C" {
 #define SWIGRUNTIME_DEBUG
 #endif
 
-
-SWIGRUNTIME void
-SWIG_InitializeModule(void *clientdata) {
+SWIGRUNTIME void SWIG_InitializeModule(void *clientdata) {
   size_t i;
   swig_module_info *module_head, *iter;
   int init;
-  
+
   /* check to see if the circular list has been setup, if not, set it up */
-  if (swig_module.next==0) {
+  if (swig_module.next == 0) {
     /* Initialize the swig_module */
     swig_module.type_initial = swig_type_initial;
     swig_module.cast_initial = swig_cast_initial;
@@ -4128,7 +4195,7 @@ SWIG_InitializeModule(void *clientdata) {
   } else {
     init = 0;
   }
-  
+
   /* Try and load any already created modules */
   module_head = SWIG_GetModule(clientdata);
   if (!module_head) {
@@ -4137,27 +4204,28 @@ SWIG_InitializeModule(void *clientdata) {
     SWIG_SetModule(clientdata, &swig_module);
   } else {
     /* the interpreter has loaded a SWIG module, but has it loaded this one? */
-    iter=module_head;
+    iter = module_head;
     do {
-      if (iter==&swig_module) {
+      if (iter == &swig_module) {
         /* Our module is already in the list, so there's nothing more to do. */
         return;
       }
-      iter=iter->next;
-    } while (iter!= module_head);
-    
+      iter = iter->next;
+    } while (iter != module_head);
+
     /* otherwise we must add our module into the list */
     swig_module.next = module_head->next;
     module_head->next = &swig_module;
   }
-  
-  /* When multiple interpreters are used, a module could have already been initialized in
-       a different interpreter, but not yet have a pointer in this interpreter.
-       In this case, we do not want to continue adding types... everything should be
-       set up already */
-  if (init == 0) return;
-  
-  /* Now work on filling in swig_module.types */
+
+  /* When multiple interpreters are used, a module could have already been
+     initialized in a different interpreter, but not yet have a pointer in this
+     interpreter. In this case, we do not want to continue adding types...
+     everything should be set up already */
+  if (init == 0)
+    return;
+
+    /* Now work on filling in swig_module.types */
 #ifdef SWIGRUNTIME_DEBUG
   printf("SWIG_InitializeModule: size %d\n", swig_module.size);
 #endif
@@ -4165,14 +4233,16 @@ SWIG_InitializeModule(void *clientdata) {
     swig_type_info *type = 0;
     swig_type_info *ret;
     swig_cast_info *cast;
-    
+
 #ifdef SWIGRUNTIME_DEBUG
-    printf("SWIG_InitializeModule: type %d %s\n", i, swig_module.type_initial[i]->name);
+    printf("SWIG_InitializeModule: type %d %s\n", i,
+           swig_module.type_initial[i]->name);
 #endif
-    
+
     /* if there is another module already loaded */
     if (swig_module.next != &swig_module) {
-      type = SWIG_MangledTypeQueryModule(swig_module.next, &swig_module, swig_module.type_initial[i]->name);
+      type = SWIG_MangledTypeQueryModule(swig_module.next, &swig_module,
+                                         swig_module.type_initial[i]->name);
     }
     if (type) {
       /* Overwrite clientdata field */
@@ -4182,13 +4252,14 @@ SWIG_InitializeModule(void *clientdata) {
       if (swig_module.type_initial[i]->clientdata) {
         type->clientdata = swig_module.type_initial[i]->clientdata;
 #ifdef SWIGRUNTIME_DEBUG
-        printf("SWIG_InitializeModule: found and overwrite type %s \n", type->name);
+        printf("SWIG_InitializeModule: found and overwrite type %s \n",
+               type->name);
 #endif
       }
     } else {
       type = swig_module.type_initial[i];
     }
-    
+
     /* Insert casting types */
     cast = swig_module.cast_initial[i];
     while (cast->type) {
@@ -4198,9 +4269,11 @@ SWIG_InitializeModule(void *clientdata) {
       printf("SWIG_InitializeModule: look cast %s\n", cast->type->name);
 #endif
       if (swig_module.next != &swig_module) {
-        ret = SWIG_MangledTypeQueryModule(swig_module.next, &swig_module, cast->type->name);
+        ret = SWIG_MangledTypeQueryModule(swig_module.next, &swig_module,
+                                          cast->type->name);
 #ifdef SWIGRUNTIME_DEBUG
-        if (ret) printf("SWIG_InitializeModule: found cast %s\n", ret->name);
+        if (ret)
+          printf("SWIG_InitializeModule: found cast %s\n", ret->name);
 #endif
       }
       if (ret) {
@@ -4214,12 +4287,14 @@ SWIG_InitializeModule(void *clientdata) {
           /* Check for casting already in the list */
           swig_cast_info *ocast = SWIG_TypeCheck(ret->name, type);
 #ifdef SWIGRUNTIME_DEBUG
-          if (ocast) printf("SWIG_InitializeModule: skip old cast %s\n", ret->name);
+          if (ocast)
+            printf("SWIG_InitializeModule: skip old cast %s\n", ret->name);
 #endif
-          if (!ocast) ret = 0;
+          if (!ocast)
+            ret = 0;
         }
       }
-      
+
       if (!ret) {
 #ifdef SWIGRUNTIME_DEBUG
         printf("SWIG_InitializeModule: adding cast %s\n", cast->type->name);
@@ -4236,45 +4311,46 @@ SWIG_InitializeModule(void *clientdata) {
     swig_module.types[i] = type;
   }
   swig_module.types[i] = 0;
-  
+
 #ifdef SWIGRUNTIME_DEBUG
   printf("**** SWIG_InitializeModule: Cast List ******\n");
   for (i = 0; i < swig_module.size; ++i) {
     int j = 0;
     swig_cast_info *cast = swig_module.cast_initial[i];
-    printf("SWIG_InitializeModule: type %d %s\n", i, swig_module.type_initial[i]->name);
+    printf("SWIG_InitializeModule: type %d %s\n", i,
+           swig_module.type_initial[i]->name);
     while (cast->type) {
       printf("SWIG_InitializeModule: cast type %s\n", cast->type->name);
       cast++;
       ++j;
     }
-    printf("---- Total casts: %d\n",j);
+    printf("---- Total casts: %d\n", j);
   }
   printf("**** SWIG_InitializeModule: Cast List ******\n");
 #endif
 }
 
 /* This function will propagate the clientdata field of type to
-* any new swig_type_info structures that have been added into the list
-* of equivalent types.  It is like calling
-* SWIG_TypeClientData(type, clientdata) a second time.
-*/
-SWIGRUNTIME void
-SWIG_PropagateClientData(void) {
+ * any new swig_type_info structures that have been added into the list
+ * of equivalent types.  It is like calling
+ * SWIG_TypeClientData(type, clientdata) a second time.
+ */
+SWIGRUNTIME void SWIG_PropagateClientData(void) {
   size_t i;
   swig_cast_info *equiv;
   static int init_run = 0;
-  
-  if (init_run) return;
+
+  if (init_run)
+    return;
   init_run = 1;
-  
+
   for (i = 0; i < swig_module.size; i++) {
     if (swig_module.types[i]->clientdata) {
       equiv = swig_module.types[i]->cast;
       while (equiv) {
         if (!equiv->converter) {
           if (equiv->type && !equiv->type->clientdata)
-          SWIG_TypeClientData(equiv->type, swig_module.types[i]->clientdata);
+            SWIG_TypeClientData(equiv->type, swig_module.types[i]->clientdata);
         }
         equiv = equiv->next;
       }
@@ -4290,318 +4366,336 @@ SWIG_PropagateClientData(void) {
 }
 #endif
 
-
-
 #ifdef __cplusplus
 extern "C" {
 #endif
-  
-  /* Python-specific SWIG API */
-#define SWIG_newvarlink()                             SWIG_Python_newvarlink()
-#define SWIG_addvarlink(p, name, get_attr, set_attr)  SWIG_Python_addvarlink(p, name, get_attr, set_attr)
-#define SWIG_InstallConstants(d, constants)           SWIG_Python_InstallConstants(d, constants)
-  
-  /* -----------------------------------------------------------------------------
-   * global variable support code.
-   * ----------------------------------------------------------------------------- */
-  
-  typedef struct swig_globalvar {
-    char       *name;                  /* Name of global variable */
-    PyObject *(*get_attr)(void);       /* Return the current value */
-    int       (*set_attr)(PyObject *); /* Set the value */
-    struct swig_globalvar *next;
-  } swig_globalvar;
-  
-  typedef struct swig_varlinkobject {
-    PyObject_HEAD
-    swig_globalvar *vars;
-  } swig_varlinkobject;
-  
-  SWIGINTERN PyObject *
-  swig_varlink_repr(swig_varlinkobject *SWIGUNUSEDPARM(v)) {
+
+/* Python-specific SWIG API */
+#define SWIG_newvarlink() SWIG_Python_newvarlink()
+#define SWIG_addvarlink(p, name, get_attr, set_attr)                           \
+  SWIG_Python_addvarlink(p, name, get_attr, set_attr)
+#define SWIG_InstallConstants(d, constants)                                    \
+  SWIG_Python_InstallConstants(d, constants)
+
+/* -----------------------------------------------------------------------------
+ * global variable support code.
+ * -----------------------------------------------------------------------------
+ */
+
+typedef struct swig_globalvar {
+  char *name;                  /* Name of global variable */
+  PyObject *(*get_attr)(void); /* Return the current value */
+  int (*set_attr)(PyObject *); /* Set the value */
+  struct swig_globalvar *next;
+} swig_globalvar;
+
+typedef struct swig_varlinkobject {
+  PyObject_HEAD swig_globalvar *vars;
+} swig_varlinkobject;
+
+SWIGINTERN PyObject *swig_varlink_repr(swig_varlinkobject *SWIGUNUSEDPARM(v)) {
 #if PY_VERSION_HEX >= 0x03000000
-    return PyUnicode_InternFromString("<Swig global variables>");
+  return PyUnicode_InternFromString("<Swig global variables>");
 #else
-    return PyString_FromString("<Swig global variables>");
+  return PyString_FromString("<Swig global variables>");
 #endif
-  }
-  
-  SWIGINTERN PyObject *
-  swig_varlink_str(swig_varlinkobject *v) {
+}
+
+SWIGINTERN PyObject *swig_varlink_str(swig_varlinkobject *v) {
 #if PY_VERSION_HEX >= 0x03000000
-    PyObject *str = PyUnicode_InternFromString("(");
-    PyObject *tail;
-    PyObject *joined;
-    swig_globalvar *var;
-    for (var = v->vars; var; var=var->next) {
-      tail = PyUnicode_FromString(var->name);
+  PyObject *str = PyUnicode_InternFromString("(");
+  PyObject *tail;
+  PyObject *joined;
+  swig_globalvar *var;
+  for (var = v->vars; var; var = var->next) {
+    tail = PyUnicode_FromString(var->name);
+    joined = PyUnicode_Concat(str, tail);
+    Py_DecRef(str);
+    Py_DecRef(tail);
+    str = joined;
+    if (var->next) {
+      tail = PyUnicode_InternFromString(", ");
       joined = PyUnicode_Concat(str, tail);
       Py_DecRef(str);
       Py_DecRef(tail);
       str = joined;
-      if (var->next) {
-        tail = PyUnicode_InternFromString(", ");
-        joined = PyUnicode_Concat(str, tail);
-        Py_DecRef(str);
-        Py_DecRef(tail);
-        str = joined;
-      }
     }
-    tail = PyUnicode_InternFromString(")");
-    joined = PyUnicode_Concat(str, tail);
-    Py_DecRef(str);
-    Py_DecRef(tail);
-    str = joined;
+  }
+  tail = PyUnicode_InternFromString(")");
+  joined = PyUnicode_Concat(str, tail);
+  Py_DecRef(str);
+  Py_DecRef(tail);
+  str = joined;
 #else
-    PyObject *str = PyString_FromString("(");
-    swig_globalvar *var;
-    for (var = v->vars; var; var=var->next) {
-      PyString_ConcatAndDel(&str,PyString_FromString(var->name));
-      if (var->next) PyString_ConcatAndDel(&str,PyString_FromString(", "));
-    }
-    PyString_ConcatAndDel(&str,PyString_FromString(")"));
-#endif
-    return str;
+  PyObject *str = PyString_FromString("(");
+  swig_globalvar *var;
+  for (var = v->vars; var; var = var->next) {
+    PyString_ConcatAndDel(&str, PyString_FromString(var->name));
+    if (var->next)
+      PyString_ConcatAndDel(&str, PyString_FromString(", "));
   }
-  
-  SWIGINTERN int
-  swig_varlink_print(swig_varlinkobject *v, FILE *fp, int SWIGUNUSEDPARM(flags)) {
-    char *tmp;
-    PyObject *str = swig_varlink_str(v);
-    fprintf(fp,"Swig global variables ");
-    fprintf(fp,"%s\n", tmp = SWIG_Python_str_AsChar(str));
-    SWIG_Python_str_DelForPy3(tmp);
-    Py_DECREF(str);
-    return 0;
+  PyString_ConcatAndDel(&str, PyString_FromString(")"));
+#endif
+  return str;
+}
+
+SWIGINTERN int swig_varlink_print(swig_varlinkobject *v, FILE *fp,
+                                  int SWIGUNUSEDPARM(flags)) {
+  char *tmp;
+  PyObject *str = swig_varlink_str(v);
+  fprintf(fp, "Swig global variables ");
+  fprintf(fp, "%s\n", tmp = SWIG_Python_str_AsChar(str));
+  SWIG_Python_str_DelForPy3(tmp);
+  Py_DECREF(str);
+  return 0;
+}
+
+SWIGINTERN void swig_varlink_dealloc(swig_varlinkobject *v) {
+  swig_globalvar *var = v->vars;
+  while (var) {
+    swig_globalvar *n = var->next;
+    free(var->name);
+    free(var);
+    var = n;
   }
-  
-  SWIGINTERN void
-  swig_varlink_dealloc(swig_varlinkobject *v) {
-    swig_globalvar *var = v->vars;
-    while (var) {
-      swig_globalvar *n = var->next;
-      free(var->name);
-      free(var);
-      var = n;
+}
+
+SWIGINTERN PyObject *swig_varlink_getattr(swig_varlinkobject *v, char *n) {
+  PyObject *res = NULL;
+  swig_globalvar *var = v->vars;
+  while (var) {
+    if (strcmp(var->name, n) == 0) {
+      res = (*var->get_attr)();
+      break;
     }
+    var = var->next;
   }
-  
-  SWIGINTERN PyObject *
-  swig_varlink_getattr(swig_varlinkobject *v, char *n) {
-    PyObject *res = NULL;
-    swig_globalvar *var = v->vars;
-    while (var) {
-      if (strcmp(var->name,n) == 0) {
-        res = (*var->get_attr)();
-        break;
-      }
-      var = var->next;
-    }
-    if (res == NULL && !PyErr_Occurred()) {
-      PyErr_Format(PyExc_AttributeError, "Unknown C global variable '%s'", n);
-    }
-    return res;
-  }
-  
-  SWIGINTERN int
-  swig_varlink_setattr(swig_varlinkobject *v, char *n, PyObject *p) {
-    int res = 1;
-    swig_globalvar *var = v->vars;
-    while (var) {
-      if (strcmp(var->name,n) == 0) {
-        res = (*var->set_attr)(p);
-        break;
-      }
-      var = var->next;
-    }
-    if (res == 1 && !PyErr_Occurred()) {
-      PyErr_Format(PyExc_AttributeError, "Unknown C global variable '%s'", n);
+  if (res == NULL && !PyErr_Occurred()) {
+    PyErr_Format(PyExc_AttributeError, "Unknown C global variable '%s'", n);
+  }
+  return res;
+}
+
+SWIGINTERN int swig_varlink_setattr(swig_varlinkobject *v, char *n,
+                                    PyObject *p) {
+  int res = 1;
+  swig_globalvar *var = v->vars;
+  while (var) {
+    if (strcmp(var->name, n) == 0) {
+      res = (*var->set_attr)(p);
+      break;
     }
-    return res;
-  }
-  
-  SWIGINTERN PyTypeObject*
-  swig_varlink_type(void) {
-    static char varlink__doc__[] = "Swig var link object";
-    static PyTypeObject varlink_type;
-    static int type_init = 0;
-    if (!type_init) {
-      const PyTypeObject tmp = {
-        /* PyObject header changed in Python 3 */
+    var = var->next;
+  }
+  if (res == 1 && !PyErr_Occurred()) {
+    PyErr_Format(PyExc_AttributeError, "Unknown C global variable '%s'", n);
+  }
+  return res;
+}
+
+SWIGINTERN PyTypeObject *swig_varlink_type(void) {
+  static char varlink__doc__[] = "Swig var link object";
+  static PyTypeObject varlink_type;
+  static int type_init = 0;
+  if (!type_init) {
+    const PyTypeObject tmp = {
+    /* PyObject header changed in Python 3 */
 #if PY_VERSION_HEX >= 0x03000000
-        PyVarObject_HEAD_INIT(NULL, 0)
+      PyVarObject_HEAD_INIT(NULL, 0)
 #else
-        PyObject_HEAD_INIT(NULL)
-        0,                                  /* ob_size */
-#endif
-        (char *)"swigvarlink",              /* tp_name */
-        sizeof(swig_varlinkobject),         /* tp_basicsize */
-        0,                                  /* tp_itemsize */
-        (destructor) swig_varlink_dealloc,  /* tp_dealloc */
-        (printfunc) swig_varlink_print,     /* tp_print */
-        (getattrfunc) swig_varlink_getattr, /* tp_getattr */
-        (setattrfunc) swig_varlink_setattr, /* tp_setattr */
-        0,                                  /* tp_compare */
-        (reprfunc) swig_varlink_repr,       /* tp_repr */
-        0,                                  /* tp_as_number */
-        0,                                  /* tp_as_sequence */
-        0,                                  /* tp_as_mapping */
-        0,                                  /* tp_hash */
-        0,                                  /* tp_call */
-        (reprfunc) swig_varlink_str,        /* tp_str */
-        0,                                  /* tp_getattro */
-        0,                                  /* tp_setattro */
-        0,                                  /* tp_as_buffer */
-        0,                                  /* tp_flags */
-        varlink__doc__,                     /* tp_doc */
-        0,                                  /* tp_traverse */
-        0,                                  /* tp_clear */
-        0,                                  /* tp_richcompare */
-        0,                                  /* tp_weaklistoffset */
+      PyObject_HEAD_INIT(NULL) 0, /* ob_size */
+#endif
+          (char *) "swigvarlink",        /* tp_name */
+      sizeof(swig_varlinkobject),        /* tp_basicsize */
+      0,                                 /* tp_itemsize */
+      (destructor)swig_varlink_dealloc,  /* tp_dealloc */
+      (printfunc)swig_varlink_print,     /* tp_print */
+      (getattrfunc)swig_varlink_getattr, /* tp_getattr */
+      (setattrfunc)swig_varlink_setattr, /* tp_setattr */
+      0,                                 /* tp_compare */
+      (reprfunc)swig_varlink_repr,       /* tp_repr */
+      0,                                 /* tp_as_number */
+      0,                                 /* tp_as_sequence */
+      0,                                 /* tp_as_mapping */
+      0,                                 /* tp_hash */
+      0,                                 /* tp_call */
+      (reprfunc)swig_varlink_str,        /* tp_str */
+      0,                                 /* tp_getattro */
+      0,                                 /* tp_setattro */
+      0,                                 /* tp_as_buffer */
+      0,                                 /* tp_flags */
+      varlink__doc__,                    /* tp_doc */
+      0,                                 /* tp_traverse */
+      0,                                 /* tp_clear */
+      0,                                 /* tp_richcompare */
+      0,                                 /* tp_weaklistoffset */
 #if PY_VERSION_HEX >= 0x02020000
-        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* tp_iter -> tp_weaklist */
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0,
+      0, /* tp_iter -> tp_weaklist */
 #endif
 #if PY_VERSION_HEX >= 0x02030000
-        0,                                  /* tp_del */
+      0, /* tp_del */
 #endif
 #if PY_VERSION_HEX >= 0x02060000
-        0,                                  /* tp_version_tag */
+      0, /* tp_version_tag */
 #endif
 #if PY_VERSION_HEX >= 0x03040000
-        0,                                  /* tp_finalize */
+      0, /* tp_finalize */
 #endif
 #ifdef COUNT_ALLOCS
-        0,                                  /* tp_allocs */
-        0,                                  /* tp_frees */
-        0,                                  /* tp_maxalloc */
+      0, /* tp_allocs */
+      0, /* tp_frees */
+      0, /* tp_maxalloc */
 #if PY_VERSION_HEX >= 0x02050000
-        0,                                  /* tp_prev */
+      0, /* tp_prev */
 #endif
-        0                                   /* tp_next */
+      0 /* tp_next */
 #endif
-      };
-      varlink_type = tmp;
-      type_init = 1;
+    };
+    varlink_type = tmp;
+    type_init = 1;
 #if PY_VERSION_HEX < 0x02020000
-      varlink_type.ob_type = &PyType_Type;
+    varlink_type.ob_type = &PyType_Type;
 #else
-      if (PyType_Ready(&varlink_type) < 0)
+    if (PyType_Ready(&varlink_type) < 0)
       return NULL;
 #endif
+  }
+  return &varlink_type;
+}
+
+/* Create a variable linking object for use later */
+SWIGINTERN PyObject *SWIG_Python_newvarlink(void) {
+  swig_varlinkobject *result =
+      PyObject_NEW(swig_varlinkobject, swig_varlink_type());
+  if (result) {
+    result->vars = 0;
+  }
+  return ((PyObject *)result);
+}
+
+SWIGINTERN void SWIG_Python_addvarlink(PyObject *p, char *name,
+                                       PyObject *(*get_attr)(void),
+                                       int (*set_attr)(PyObject *p)) {
+  swig_varlinkobject *v = (swig_varlinkobject *)p;
+  swig_globalvar *gv = (swig_globalvar *)malloc(sizeof(swig_globalvar));
+  if (gv) {
+    size_t size = strlen(name) + 1;
+    gv->name = (char *)malloc(size);
+    if (gv->name) {
+      strncpy(gv->name, name, size);
+      gv->get_attr = get_attr;
+      gv->set_attr = set_attr;
+      gv->next = v->vars;
     }
-    return &varlink_type;
-  }
-  
-  /* Create a variable linking object for use later */
-  SWIGINTERN PyObject *
-  SWIG_Python_newvarlink(void) {
-    swig_varlinkobject *result = PyObject_NEW(swig_varlinkobject, swig_varlink_type());
-    if (result) {
-      result->vars = 0;
-    }
-    return ((PyObject*) result);
-  }
-  
-  SWIGINTERN void 
-  SWIG_Python_addvarlink(PyObject *p, char *name, PyObject *(*get_attr)(void), int (*set_attr)(PyObject *p)) {
-    swig_varlinkobject *v = (swig_varlinkobject *) p;
-    swig_globalvar *gv = (swig_globalvar *) malloc(sizeof(swig_globalvar));
-    if (gv) {
-      size_t size = strlen(name)+1;
-      gv->name = (char *)malloc(size);
-      if (gv->name) {
-        strncpy(gv->name,name,size);
-        gv->get_attr = get_attr;
-        gv->set_attr = set_attr;
-        gv->next = v->vars;
-      }
+  }
+  v->vars = gv;
+}
+
+SWIGINTERN PyObject *SWIG_globals(void) {
+  static PyObject *_SWIG_globals = 0;
+  if (!_SWIG_globals)
+    _SWIG_globals = SWIG_newvarlink();
+  return _SWIG_globals;
+}
+
+/* -----------------------------------------------------------------------------
+ * constants/methods manipulation
+ * -----------------------------------------------------------------------------
+ */
+
+/* Install Constants */
+SWIGINTERN void SWIG_Python_InstallConstants(PyObject *d,
+                                             swig_const_info constants[]) {
+  PyObject *obj = 0;
+  size_t i;
+  for (i = 0; constants[i].type; ++i) {
+    switch (constants[i].type) {
+    case SWIG_PY_POINTER:
+      obj = SWIG_InternalNewPointerObj(constants[i].pvalue,
+                                       *(constants[i]).ptype, 0);
+      break;
+    case SWIG_PY_BINARY:
+      obj = SWIG_NewPackedObj(constants[i].pvalue, constants[i].lvalue,
+                              *(constants[i].ptype));
+      break;
+    default:
+      obj = 0;
+      break;
     }
-    v->vars = gv;
-  }
-  
-  SWIGINTERN PyObject *
-  SWIG_globals(void) {
-    static PyObject *_SWIG_globals = 0; 
-    if (!_SWIG_globals) _SWIG_globals = SWIG_newvarlink();  
-    return _SWIG_globals;
-  }
-  
-  /* -----------------------------------------------------------------------------
-   * constants/methods manipulation
-   * ----------------------------------------------------------------------------- */
-  
-  /* Install Constants */
-  SWIGINTERN void
-  SWIG_Python_InstallConstants(PyObject *d, swig_const_info constants[]) {
-    PyObject *obj = 0;
-    size_t i;
-    for (i = 0; constants[i].type; ++i) {
-      switch(constants[i].type) {
-      case SWIG_PY_POINTER:
-        obj = SWIG_InternalNewPointerObj(constants[i].pvalue, *(constants[i]).ptype,0);
-        break;
-      case SWIG_PY_BINARY:
-        obj = SWIG_NewPackedObj(constants[i].pvalue, constants[i].lvalue, *(constants[i].ptype));
-        break;
-      default:
-        obj = 0;
-        break;
-      }
-      if (obj) {
-        PyDict_SetItemString(d, constants[i].name, obj);
-        Py_DECREF(obj);
-      }
+    if (obj) {
+      PyDict_SetItemString(d, constants[i].name, obj);
+      Py_DECREF(obj);
     }
   }
-  
-  /* -----------------------------------------------------------------------------*/
-  /* Fix SwigMethods to carry the callback ptrs when needed */
-  /* -----------------------------------------------------------------------------*/
-  
-  SWIGINTERN void
-  SWIG_Python_FixMethods(PyMethodDef *methods,
-    swig_const_info *const_table,
-    swig_type_info **types,
-    swig_type_info **types_initial) {
-    size_t i;
-    for (i = 0; methods[i].ml_name; ++i) {
-      const char *c = methods[i].ml_doc;
-      if (!c) continue;
-      c = strstr(c, "swig_ptr: ");
-      if (c) {
-        int j;
-        swig_const_info *ci = 0;
-        const char *name = c + 10;
-        for (j = 0; const_table[j].type; ++j) {
-          if (strncmp(const_table[j].name, name, 
-              strlen(const_table[j].name)) == 0) {
-            ci = &(const_table[j]);
-            break;
-          }
+}
+
+/* -----------------------------------------------------------------------------*/
+/* Fix SwigMethods to carry the callback ptrs when needed */
+/* -----------------------------------------------------------------------------*/
+
+SWIGINTERN void SWIG_Python_FixMethods(PyMethodDef *methods,
+                                       swig_const_info *const_table,
+                                       swig_type_info **types,
+                                       swig_type_info **types_initial) {
+  size_t i;
+  for (i = 0; methods[i].ml_name; ++i) {
+    const char *c = methods[i].ml_doc;
+    if (!c)
+      continue;
+    c = strstr(c, "swig_ptr: ");
+    if (c) {
+      int j;
+      swig_const_info *ci = 0;
+      const char *name = c + 10;
+      for (j = 0; const_table[j].type; ++j) {
+        if (strncmp(const_table[j].name, name, strlen(const_table[j].name)) ==
+            0) {
+          ci = &(const_table[j]);
+          break;
         }
-        if (ci) {
-          void *ptr = (ci->type == SWIG_PY_POINTER) ? ci->pvalue : 0;
-          if (ptr) {
-            size_t shift = (ci->ptype) - types;
-            swig_type_info *ty = types_initial[shift];
-            size_t ldoc = (c - methods[i].ml_doc);
-            size_t lptr = strlen(ty->name)+2*sizeof(void*)+2;
-            char *ndoc = (char*)malloc(ldoc + lptr + 10);
-            if (ndoc) {
-              char *buff = ndoc;
-              strncpy(buff, methods[i].ml_doc, ldoc);
-              buff += ldoc;
-              strncpy(buff, "swig_ptr: ", 10);
-              buff += 10;
-              SWIG_PackVoidPtr(buff, ptr, ty->name, lptr);
-              methods[i].ml_doc = ndoc;
-            }
+      }
+      if (ci) {
+        void *ptr = (ci->type == SWIG_PY_POINTER) ? ci->pvalue : 0;
+        if (ptr) {
+          size_t shift = (ci->ptype) - types;
+          swig_type_info *ty = types_initial[shift];
+          size_t ldoc = (c - methods[i].ml_doc);
+          size_t lptr = strlen(ty->name) + 2 * sizeof(void *) + 2;
+          char *ndoc = (char *)malloc(ldoc + lptr + 10);
+          if (ndoc) {
+            char *buff = ndoc;
+            strncpy(buff, methods[i].ml_doc, ldoc);
+            buff += ldoc;
+            strncpy(buff, "swig_ptr: ", 10);
+            buff += 10;
+            SWIG_PackVoidPtr(buff, ptr, ty->name, lptr);
+            methods[i].ml_doc = ndoc;
           }
         }
       }
     }
-  } 
-  
+  }
+}
+
 #ifdef __cplusplus
 }
 #endif
@@ -4614,27 +4708,26 @@ extern "C" {
 extern "C"
 #endif
 
-SWIGEXPORT 
+    SWIGEXPORT
 #if PY_VERSION_HEX >= 0x03000000
-PyObject*
+        PyObject *
 #else
 void
 #endif
-SWIG_init(void) {
+        SWIG_init(void) {
   PyObject *m, *d, *md;
 #if PY_VERSION_HEX >= 0x03000000
   static struct PyModuleDef SWIG_module = {
-# if PY_VERSION_HEX >= 0x03020000
+#if PY_VERSION_HEX >= 0x03020000
     PyModuleDef_HEAD_INIT,
-# else
+#else
     {
-      PyObject_HEAD_INIT(NULL)
-      NULL, /* m_init */
-      0,    /* m_index */
-      NULL, /* m_copy */
+        PyObject_HEAD_INIT(NULL) NULL, /* m_init */
+        0,                             /* m_index */
+        NULL,                          /* m_copy */
     },
-# endif
-    (char *) SWIG_name,
+#endif
+    (char *)SWIG_name,
     NULL,
     -1,
     SwigMethods,
@@ -4644,21 +4737,16 @@ SWIG_init(void) {
     NULL
   };
 #endif
-  
+
 #if defined(SWIGPYTHON_BUILTIN)
-  static SwigPyClientData SwigPyObject_clientdata = {
-    0, 0, 0, 0, 0, 0, 0
-  };
+  static SwigPyClientData SwigPyObject_clientdata = {0, 0, 0, 0, 0, 0, 0};
   static PyGetSetDef this_getset_def = {
-    (char *)"this", &SwigPyBuiltin_ThisClosure, NULL, NULL, NULL
-  };
-  static SwigPyGetSet thisown_getset_closure = {
-    (PyCFunction) SwigPyObject_own,
-    (PyCFunction) SwigPyObject_own
-  };
+      (char *)"this", &SwigPyBuiltin_ThisClosure, NULL, NULL, NULL};
+  static SwigPyGetSet thisown_getset_closure = {(PyCFunction)SwigPyObject_own,
+                                                (PyCFunction)SwigPyObject_own};
   static PyGetSetDef thisown_getset_def = {
-    (char *)"thisown", SwigPyBuiltin_GetterClosure, SwigPyBuiltin_SetterClosure, NULL, &thisown_getset_closure
-  };
+      (char *)"thisown", SwigPyBuiltin_GetterClosure,
+      SwigPyBuiltin_SetterClosure, NULL, &thisown_getset_closure};
   PyObject *metatype_args;
   PyTypeObject *builtin_pytype;
   int builtin_base_count;
@@ -4672,83 +4760,85 @@ SWIG_init(void) {
   PyObject *thisown_descr;
   PyObject *self = 0;
   int i;
-  
+
   (void)builtin_pytype;
   (void)builtin_base_count;
   (void)builtin_basetype;
   (void)tuple;
   (void)static_getset;
   (void)self;
-  
+
   /* metatype is used to implement static member variables. */
   metatype_args = Py_BuildValue("(s(O){})", "SwigPyObjectType", &PyType_Type);
   assert(metatype_args);
-  metatype = (PyTypeObject *) PyType_Type.tp_call((PyObject *) &PyType_Type, metatype_args, NULL);
+  metatype = (PyTypeObject *)PyType_Type.tp_call((PyObject *)&PyType_Type,
+                                                 metatype_args, NULL);
   assert(metatype);
   Py_DECREF(metatype_args);
-  metatype->tp_setattro = (setattrofunc) &SwigPyObjectType_setattro;
+  metatype->tp_setattro = (setattrofunc)&SwigPyObjectType_setattro;
   assert(PyType_Ready(metatype) >= 0);
 #endif
-  
+
   /* Fix SwigMethods to carry the callback ptrs when needed */
-  SWIG_Python_FixMethods(SwigMethods, swig_const_table, swig_types, swig_type_initial);
-  
+  SWIG_Python_FixMethods(SwigMethods, swig_const_table, swig_types,
+                         swig_type_initial);
+
 #if PY_VERSION_HEX >= 0x03000000
   m = PyModule_Create(&SWIG_module);
 #else
-  m = Py_InitModule((char *) SWIG_name, SwigMethods);
+  m = Py_InitModule((char *)SWIG_name, SwigMethods);
 #endif
-  
+
   md = d = PyModule_GetDict(m);
   (void)md;
-  
+
   SWIG_InitializeModule(0);
-  
+
 #ifdef SWIGPYTHON_BUILTIN
   SwigPyObject_stype = SWIG_MangledTypeQuery("_p_SwigPyObject");
   assert(SwigPyObject_stype);
-  cd = (SwigPyClientData*) SwigPyObject_stype->clientdata;
+  cd = (SwigPyClientData *)SwigPyObject_stype->clientdata;
   if (!cd) {
     SwigPyObject_stype->clientdata = &SwigPyObject_clientdata;
     SwigPyObject_clientdata.pytype = SwigPyObject_TypeOnce();
-  } else if (SwigPyObject_TypeOnce()->tp_basicsize != cd->pytype->tp_basicsize) {
-    PyErr_SetString(PyExc_RuntimeError, "Import error: attempted to load two incompatible swig-generated modules.");
-# if PY_VERSION_HEX >= 0x03000000
+  } else if (SwigPyObject_TypeOnce()->tp_basicsize !=
+             cd->pytype->tp_basicsize) {
+    PyErr_SetString(PyExc_RuntimeError, "Import error: attempted to load two "
+                                        "incompatible swig-generated modules.");
+#if PY_VERSION_HEX >= 0x03000000
     return NULL;
-# else
+#else
     return;
-# endif
+#endif
   }
-  
+
   /* All objects have a 'this' attribute */
   this_descr = PyDescr_NewGetSet(SwigPyObject_type(), &this_getset_def);
   (void)this_descr;
-  
+
   /* All objects have a 'thisown' attribute */
   thisown_descr = PyDescr_NewGetSet(SwigPyObject_type(), &thisown_getset_def);
   (void)thisown_descr;
-  
+
   public_interface = PyList_New(0);
   public_symbol = 0;
   (void)public_symbol;
-  
+
   PyDict_SetItemString(md, "__all__", public_interface);
   Py_DECREF(public_interface);
   for (i = 0; SwigMethods[i].ml_name != NULL; ++i)
-  SwigPyBuiltin_AddPublicSymbol(public_interface, SwigMethods[i].ml_name);
+    SwigPyBuiltin_AddPublicSymbol(public_interface, SwigMethods[i].ml_name);
   for (i = 0; swig_const_table[i].name != 0; ++i)
-  SwigPyBuiltin_AddPublicSymbol(public_interface, swig_const_table[i].name);
+    SwigPyBuiltin_AddPublicSymbol(public_interface, swig_const_table[i].name);
 #endif
-  
-  SWIG_InstallConstants(d,swig_const_table);
-  
-  
+
+  SWIG_InstallConstants(d, swig_const_table);
+
   import_array();
-  
+
 #if PY_VERSION_HEX >= 0x03000000
   return m;
 #else
   return;
 #endif
 }
-
diff --git a/hpvm/test/hpvm-cava/src/cam_pipe.c b/hpvm/test/hpvm-cava/src/cam_pipe.c
index 7874ff9d529afebc40d1660637e85b3a1e00f23e..cdeaf393320121706d13d423212896e2551142c8 100644
--- a/hpvm/test/hpvm-cava/src/cam_pipe.c
+++ b/hpvm/test/hpvm-cava/src/cam_pipe.c
@@ -1,11 +1,11 @@
+#include "cam_pipe_utility.h"
+#include "dma_interface.h"
+#include "load_cam_model.h"
+#include "pipe_stages.h"
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <assert.h>
-#include "pipe_stages.h"
-#include "load_cam_model.h"
-#include "cam_pipe_utility.h"
-#include "dma_interface.h"
 #ifdef DMA_MODE
 #include "gem5_harness.h"
 #endif
@@ -13,7 +13,7 @@
 // FIXME: Include gem5/dma_interface.cc/h separately
 #ifndef DMA_INTERFACE_V3
 #define DMA_INTERFACE_V3
-#endif//DMA_INTERFACE_V3
+#endif // DMA_INTERFACE_V3
 
 ///////////////////////////////////////////////////////////////
 // Camera Model Parameters
@@ -71,7 +71,8 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size,
   uint8_t *acc_input, *acc_result;
   float *acc_input_scaled, *acc_result_scaled;
   float *host_TsTw, *host_ctrl_pts, *host_weights, *host_coefs, *host_tone_map;
-  float *acc_TsTw, *acc_ctrl_pts, *acc_weights, *acc_coefs, *acc_tone_map, *acc_l2_dist;
+  float *acc_TsTw, *acc_ctrl_pts, *acc_weights, *acc_coefs, *acc_tone_map,
+      *acc_l2_dist;
 
   strcat(cam_model_path, "cam_models/NikonD7000/");
 
@@ -84,20 +85,25 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size,
   host_coefs = get_coefs(cam_model_path, num_ctrl_pts);
   host_tone_map = get_tone_map(cam_model_path);
 
-  acc_input = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE);
-  acc_result = (uint8_t*) malloc_aligned(sizeof(uint8_t) * row_size * col_size * CHAN_SIZE);
-  acc_input_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
-  acc_result_scaled = (float*) malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
-  acc_TsTw = (float*) malloc_aligned(sizeof(float) * 9);
-  acc_ctrl_pts = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
-  acc_weights = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
-  acc_coefs = (float*) malloc_aligned(sizeof(float) * 12);
-  acc_tone_map = (float*) malloc_aligned(sizeof(float) * 256 * CHAN_SIZE);
-  acc_l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts);
+  acc_input = (uint8_t *)malloc_aligned(sizeof(uint8_t) * row_size * col_size *
+                                        CHAN_SIZE);
+  acc_result = (uint8_t *)malloc_aligned(sizeof(uint8_t) * row_size * col_size *
+                                         CHAN_SIZE);
+  acc_input_scaled =
+      (float *)malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
+  acc_result_scaled =
+      (float *)malloc_aligned(sizeof(float) * row_size * col_size * CHAN_SIZE);
+  acc_TsTw = (float *)malloc_aligned(sizeof(float) * 9);
+  acc_ctrl_pts =
+      (float *)malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
+  acc_weights =
+      (float *)malloc_aligned(sizeof(float) * num_ctrl_pts * CHAN_SIZE);
+  acc_coefs = (float *)malloc_aligned(sizeof(float) * 12);
+  acc_tone_map = (float *)malloc_aligned(sizeof(float) * 256 * CHAN_SIZE);
+  acc_l2_dist = (float *)malloc_aligned(sizeof(float) * num_ctrl_pts);
 
   // Load camera model parameters for the ISP
-  MAP_ARRAY_TO_ACCEL(ISP, "host_TsTw", host_TsTw,
-                     sizeof(float) * 9);
+  MAP_ARRAY_TO_ACCEL(ISP, "host_TsTw", host_TsTw, sizeof(float) * 9);
   MAP_ARRAY_TO_ACCEL(ISP, "host_ctrl_pts", host_ctrl_pts,
                      sizeof(float) * num_ctrl_pts * CHAN_SIZE);
   MAP_ARRAY_TO_ACCEL(ISP, "host_weights", host_weights,
@@ -136,4 +142,3 @@ void cam_pipe(uint8_t *host_input, uint8_t *host_result, int row_size,
   free(acc_tone_map);
   free(acc_l2_dist);
 }
-
diff --git a/hpvm/test/hpvm-cava/src/cam_pipe_utility.c b/hpvm/test/hpvm-cava/src/cam_pipe_utility.c
index f806e9ee1a2e288fabcb8ad658a47c3919fbb661..864f02d5b28f2c4738279cf66cba5f4312c2a3de 100644
--- a/hpvm/test/hpvm-cava/src/cam_pipe_utility.c
+++ b/hpvm/test/hpvm-cava/src/cam_pipe_utility.c
@@ -1,6 +1,6 @@
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <assert.h>
 
 #include "cam_pipe_utility.h"
 //#include "pipe_stages.h"
@@ -26,10 +26,11 @@ uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size) {
   return image;
 }
 
-void write_image_to_binary(char *file_path, uint8_t *image, int row_size, int col_size) {
+void write_image_to_binary(char *file_path, uint8_t *image, int row_size,
+                           int col_size) {
   FILE *fp = fopen(file_path, "w");
 
-  int shape[3] = { row_size, col_size, CHAN_SIZE };
+  int shape[3] = {row_size, col_size, CHAN_SIZE};
   fwrite(shape, sizeof(int), 3, fp);
 
   int size = row_size * col_size * CHAN_SIZE;
@@ -40,8 +41,8 @@ void write_image_to_binary(char *file_path, uint8_t *image, int row_size, int co
 float *transpose_mat(float *inmat, int width, int height) {
   // Define vectors
   float *outmat;
-  int err =
-      posix_memalign((void **)&outmat, CACHELINE_SIZE, sizeof(float) * height * width);
+  int err = posix_memalign((void **)&outmat, CACHELINE_SIZE,
+                           sizeof(float) * height * width);
   assert(err == 0 && "Failed to allocate memory!");
 
   // Transpose the matrix
@@ -71,7 +72,7 @@ void convert_chw_to_hwc(uint8_t *input, int row_size, int col_size,
                         uint8_t **result) {
   if (*result == NULL) {
     *result = (uint8_t *)malloc_aligned(row_size * col_size * CHAN_SIZE *
-                                      sizeof(uint8_t));
+                                        sizeof(uint8_t));
   }
   ARRAY_3D(uint8_t, _input, input, row_size, col_size);
   ARRAY_3D(uint8_t, _result, *result, col_size, CHAN_SIZE);
diff --git a/hpvm/test/hpvm-cava/src/cam_pipe_utility.h b/hpvm/test/hpvm-cava/src/cam_pipe_utility.h
index b4fb6cde0c438b23c2b596cf0418953aaedca501..b61b7cc9b52aa59522f93661895fca960b947f17 100644
--- a/hpvm/test/hpvm-cava/src/cam_pipe_utility.h
+++ b/hpvm/test/hpvm-cava/src/cam_pipe_utility.h
@@ -1,8 +1,8 @@
 #ifndef _CAM_PIPE_UTILITY_H_
 #define _CAM_PIPE_UTILITY_H_
 
-#include "utility.h"
 #include "pipe_stages.h"
+#include "utility.h"
 
 uint8_t *read_image_from_binary(char *file_path, int *row_size, int *col_size);
 void write_image_to_binary(char *file_path, uint8_t *image, int row_size,
diff --git a/hpvm/test/hpvm-cava/src/defs.h b/hpvm/test/hpvm-cava/src/defs.h
index ccc8acc857c36fd13115670932a38dc3a406dc29..0fa95ef3d2ea55c67a921e0bc5fc8a6ec6ba949f 100644
--- a/hpvm/test/hpvm-cava/src/defs.h
+++ b/hpvm/test/hpvm-cava/src/defs.h
@@ -10,46 +10,46 @@ typedef unsigned long uint64_t;
 
 // Debugging message macros.
 #if DEBUG_LEVEL >= 1
-  #define INFO_MSG(args...) printf(args)
-
-  #if DEBUG_LEVEL >= 2
-    #define PRINT_MSG(args...) printf(args)
-    #define PRINT_DEBUG(hid, rows, cols, num_cols)                                 \
-        print_debug(hid, rows, cols, num_cols)
-    #define PRINT_DEBUG4D(hid, rows, cols, height)                                 \
-        print_debug4d(hid, rows, cols, height)
-    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)                       \
-        print_debug4d_fp16(hid, num, height, rows, cols)
-
-    #if DEBUG_LEVEL >= 3
-      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)                               \
-          print_debug(hid, rows, cols, num_cols)
-      #define PRINT_DEBUG4D_V(hid, rows, cols, height)                               \
-          print_debug4d(hid, rows, cols, height)
-      #define PRINT_MSG_V(args...) printf(args)
-    #else
-      #define PRINT_DEBUG_V(hid, rows, cols, num_cols)
-      #define PRINT_DEBUG4D_V(hid, rows, cols, height)
-      #define PRINT_MSG_V(args...)
-    #endif
-  #else
-    #define PRINT_MSG(args...)
-    #define PRINT_DEBUG(hid, rows, cols, num_cols)
-    #define PRINT_DEBUG4D(hid, rows, cols, height)
-    #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
-    #define PRINT_DEBUG_V(hid, rows, cols, height)
-    #define PRINT_DEBUG4D_V(hid, rows, cols, height)
-    #define PRINT_MSG_V(args...)
-  #endif
+#define INFO_MSG(args...) printf(args)
+
+#if DEBUG_LEVEL >= 2
+#define PRINT_MSG(args...) printf(args)
+#define PRINT_DEBUG(hid, rows, cols, num_cols)                                 \
+  print_debug(hid, rows, cols, num_cols)
+#define PRINT_DEBUG4D(hid, rows, cols, height)                                 \
+  print_debug4d(hid, rows, cols, height)
+#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)                       \
+  print_debug4d_fp16(hid, num, height, rows, cols)
+
+#if DEBUG_LEVEL >= 3
+#define PRINT_DEBUG_V(hid, rows, cols, num_cols)                               \
+  print_debug(hid, rows, cols, num_cols)
+#define PRINT_DEBUG4D_V(hid, rows, cols, height)                               \
+  print_debug4d(hid, rows, cols, height)
+#define PRINT_MSG_V(args...) printf(args)
 #else
-  #define INFO_MSG(args...)
-  #define PRINT_DEBUG(hid, rows, cols, num_cols)
-  #define PRINT_DEBUG4D(hid, rows, cols, height)
-  #define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
-  #define PRINT_MSG(args...)
-  #define PRINT_DEBUG_V(hid, rows, cols, height)
-  #define PRINT_DEBUG4D_V(hid, rows, cols, height)
-  #define PRINT_MSG_V(args...)
+#define PRINT_DEBUG_V(hid, rows, cols, num_cols)
+#define PRINT_DEBUG4D_V(hid, rows, cols, height)
+#define PRINT_MSG_V(args...)
+#endif
+#else
+#define PRINT_MSG(args...)
+#define PRINT_DEBUG(hid, rows, cols, num_cols)
+#define PRINT_DEBUG4D(hid, rows, cols, height)
+#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
+#define PRINT_DEBUG_V(hid, rows, cols, height)
+#define PRINT_DEBUG4D_V(hid, rows, cols, height)
+#define PRINT_MSG_V(args...)
+#endif
+#else
+#define INFO_MSG(args...)
+#define PRINT_DEBUG(hid, rows, cols, num_cols)
+#define PRINT_DEBUG4D(hid, rows, cols, height)
+#define PRINT_DEBUG4D_FP16(hid, num, height, rows, cols)
+#define PRINT_MSG(args...)
+#define PRINT_DEBUG_V(hid, rows, cols, height)
+#define PRINT_DEBUG4D_V(hid, rows, cols, height)
+#define PRINT_MSG_V(args...)
 #endif
 
 #define STRING(arg) #arg
@@ -72,9 +72,9 @@ typedef unsigned long uint64_t;
 #define max3(e0, e1, e2) max2(max2(e0, e1), e2)
 #define max4(e0, e1, e2, e3) max2(max2(e0, e1), max2(e2, e3))
 #define max8(e0, e1, e2, e3, e4, e5, e6, e7)                                   \
-    max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7))
+  max2(max4(e0, e1, e2, e3), max4(e4, e5, e6, e7))
 #define max9(e0, e1, e2, e3, e4, e5, e6, e7, e8)                               \
-    max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8)
+  max2(max8(e0, e1, e2, e3, e4, e5, e6, e7), e8)
 
 #define min2(A, B) (((A) < (B)) ? (A) : (B))
 
@@ -92,7 +92,8 @@ typedef unsigned long uint64_t;
 //  If GEM5_HARNESS is defined:
 //
 //     MAP_ARRAY_TO_ACCEL(myReqCode, myArrayName, myArrayPtr, mySize)
-//        ===>   mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr, mySize)
+//        ===>   mapArrayToAccelerator(myReqCode, myArrayName, myArrayPtr,
+//        mySize)
 //
 //     INVOKE_KERNEL(myReqCode, kernelFuncName, args...)
 //        ===>   invokeAcceleratorAndBlock(myReqCode)
@@ -107,69 +108,69 @@ typedef unsigned long uint64_t;
 #ifdef GEM5_HARNESS
 
 #define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
-    mapArrayToAccelerator(req_code, name, base_addr, size)
+  mapArrayToAccelerator(req_code, name, base_addr, size)
 #define INVOKE_KERNEL(req_code, kernel_ptr, args...)                           \
-    do {                                                                       \
-        UNUSED(kernel_ptr);                                                    \
-        invokeAcceleratorAndBlock(req_code);                                   \
-    } while (0)
+  do {                                                                         \
+    UNUSED(kernel_ptr);                                                        \
+    invokeAcceleratorAndBlock(req_code);                                       \
+  } while (0)
 #define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
-    do {                                                                       \
-        UNUSED(kernel_ptr);                                                    \
-        invokeAcceleratorAndReturn2(req_code, finish_flag);                    \
-    } while (0)
+  do {                                                                         \
+    UNUSED(kernel_ptr);                                                        \
+    invokeAcceleratorAndReturn2(req_code, finish_flag);                        \
+  } while (0)
 
 #define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, false, false);       \
-    } while (0)
+  do {                                                                         \
+    invokeAladdinTrafficGenAndBlock(start_addr, size, false, false);           \
+  } while (0)
 #define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, true, false);        \
-    } while (0)
+  do {                                                                         \
+    invokeAladdinTrafficGenAndBlock(start_addr, size, true, false);            \
+  } while (0)
 #define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, false, true);        \
-    } while (0)
+  do {                                                                         \
+    invokeAladdinTrafficGenAndBlock(start_addr, size, false, true);            \
+  } while (0)
 #define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        invokeAladdinTrafficGenAndBlock(start_addr, size, true, true);         \
-    } while (0)
+  do {                                                                         \
+    invokeAladdinTrafficGenAndBlock(start_addr, size, true, true);             \
+  } while (0)
 
 #else
 
 #define MAP_ARRAY_TO_ACCEL(req_code, name, base_addr, size)                    \
-    do {                                                                       \
-        INFO_MSG("Mapping array %s @ %p, size %d.\n",                          \
-                 name, (void*)base_addr, (int)(size));                         \
-        UNUSED(req_code);                                                      \
-        UNUSED(name);                                                          \
-        UNUSED(base_addr);                                                     \
-        UNUSED(size);                                                          \
-    } while (0)
+  do {                                                                         \
+    INFO_MSG("Mapping array %s @ %p, size %d.\n", name, (void *)base_addr,     \
+             (int)(size));                                                     \
+    UNUSED(req_code);                                                          \
+    UNUSED(name);                                                              \
+    UNUSED(base_addr);                                                         \
+    UNUSED(size);                                                              \
+  } while (0)
 #define INVOKE_KERNEL(req_code, kernel_ptr, args...) kernel_ptr(args)
 #define INVOKE_KERNEL_NOBLOCK(req_code, finish_flag, kernel_ptr, args...)      \
-    kernel_ptr(args)
+  kernel_ptr(args)
 #define INVOKE_DMA_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
+  do {                                                                         \
+    UNUSED(start_addr);                                                        \
+    UNUSED(size);                                                              \
+  } while (0)
 #define INVOKE_DMA_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
+  do {                                                                         \
+    UNUSED(start_addr);                                                        \
+    UNUSED(size);                                                              \
+  } while (0)
 #define INVOKE_ACP_READ_TRAFFIC_GEN(start_addr, size)                          \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
+  do {                                                                         \
+    UNUSED(start_addr);                                                        \
+    UNUSED(size);                                                              \
+  } while (0)
 #define INVOKE_ACP_WRITE_TRAFFIC_GEN(start_addr, size)                         \
-    do {                                                                       \
-        UNUSED(start_addr);                                                    \
-        UNUSED(size);                                                          \
-    } while (0)
+  do {                                                                         \
+    UNUSED(start_addr);                                                        \
+    UNUSED(size);                                                              \
+  } while (0)
 
 #endif
 
@@ -177,14 +178,14 @@ typedef unsigned long uint64_t;
 //
 // This assumes that the current name of the base pointer is also the name of
 // the array in the top level function of the dynamic trace. THIS IS VERY
-// IMPORTANT - if the argument passed to a top level function has been renamed in
-// the function, then this WILL NOT WORK!
+// IMPORTANT - if the argument passed to a top level function has been renamed
+// in the function, then this WILL NOT WORK!
 //
 // MAP_ARRAY(myReqCode, myArray, mySize)
 //    ===>   MAP_ARRAY_TO_ACCEL(myReqCode, "myArray", myArray, mySize)
 #define MAP_ARRAY(req_code, name_and_base_addr, size)                          \
-    MAP_ARRAY_TO_ACCEL(                                                        \
-            req_code, STRING(name_and_base_addr), name_and_base_addr, size)
+  MAP_ARRAY_TO_ACCEL(req_code, STRING(name_and_base_addr), name_and_base_addr, \
+                     size)
 
 // Use these convenience macros to cast a raw pointer into a multidimensional
 // variable-length array, which lets us use [] notation inside of the ugly
@@ -202,23 +203,24 @@ typedef unsigned long uint64_t;
 //
 //   And so on...
 #define ARRAY_1D(TYPE, output_array_name, input_array_name)                    \
-    TYPE* output_array_name = (TYPE*)input_array_name
+  TYPE *output_array_name = (TYPE *)input_array_name
 
 #define ARRAY_2D(TYPE, output_array_name, input_array_name, DIM_1)             \
-    TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name
+  TYPE(*output_array_name)[DIM_1] = (TYPE(*)[DIM_1])input_array_name
 
 #define ARRAY_3D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2)      \
-    TYPE(*output_array_name)[DIM_1][DIM_2] =                                   \
-        (TYPE(*)[DIM_1][DIM_2])input_array_name
-
-#define ARRAY_4D(                                                              \
-    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3)            \
-        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3] =                        \
-            (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name
-
-#define ARRAY_5D(                                                              \
-    TYPE, output_array_name, input_array_name, DIM_1, DIM_2, DIM_3, DIM_4)     \
-        TYPE(*output_array_name)[DIM_1][DIM_2][DIM_3][DIM_4] =                 \
-            (TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name
+  TYPE(*output_array_name)                                                     \
+  [DIM_1][DIM_2] = (TYPE(*)[DIM_1][DIM_2])input_array_name
+
+#define ARRAY_4D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2,      \
+                 DIM_3)                                                        \
+  TYPE(*output_array_name)                                                     \
+  [DIM_1][DIM_2][DIM_3] = (TYPE(*)[DIM_1][DIM_2][DIM_3])input_array_name
+
+#define ARRAY_5D(TYPE, output_array_name, input_array_name, DIM_1, DIM_2,      \
+                 DIM_3, DIM_4)                                                 \
+  TYPE(*output_array_name)                                                     \
+  [DIM_1][DIM_2][DIM_3][DIM_4] =                                               \
+      (TYPE(*)[DIM_1][DIM_2][DIM_3][DIM_4])input_array_name
 
 #endif
diff --git a/hpvm/test/hpvm-cava/src/dma_interface.c b/hpvm/test/hpvm-cava/src/dma_interface.c
index 81bce54469886153170f994a77250a784cc9b7d7..68698635a4fceb4fe67e323bd0f354bd70bca99d 100644
--- a/hpvm/test/hpvm-cava/src/dma_interface.c
+++ b/hpvm/test/hpvm-cava/src/dma_interface.c
@@ -1,6 +1,6 @@
+#include "dma_interface.h"
 #include <assert.h>
 #include <string.h>
-#include "dma_interface.h"
 
 // All _dmaImplN functions must be always inlined or we'll get extra functions
 // in the trace.
@@ -10,22 +10,22 @@
 // Starting with version 3, all versioning will be distinguished by the return
 // value of the DMA functions.
 
-__attribute__((__always_inline__))
-int _dmaImpl3(void* dst_addr, void* src_addr, size_t size) {
+__attribute__((__always_inline__)) int _dmaImpl3(void *dst_addr, void *src_addr,
+                                                 size_t size) {
   assert(size > 0);
   memmove(dst_addr, src_addr, size);
   return 3;
 }
 
-int dmaLoad(void* dst_addr, void* src_host_addr, size_t size) {
+int dmaLoad(void *dst_addr, void *src_host_addr, size_t size) {
   return _dmaImpl3(dst_addr, src_host_addr, size);
 }
 
-int dmaStore(void* dst_host_addr, void* src_addr, size_t size) {
+int dmaStore(void *dst_host_addr, void *src_addr, size_t size) {
   return _dmaImpl3(dst_host_addr, src_addr, size);
 }
 
-int setReadyBits(void* start_addr, size_t size, unsigned value) {
+int setReadyBits(void *start_addr, size_t size, unsigned value) {
   asm("");
   return 0;
 }
@@ -35,39 +35,37 @@ int setReadyBits(void* start_addr, size_t size, unsigned value) {
 // With version 2 and earlier, we return (void*)NULL and use the number of
 // function arguments to distinguish the DMA functions.
 
-__attribute__((__always_inline__))
-void* _dmaImpl2(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
+__attribute__((__always_inline__)) void *
+_dmaImpl2(void *base_addr, size_t src_off, size_t dst_off, size_t size) {
   assert(size > 0);
   memmove(base_addr + dst_off, base_addr + src_off, size);
   return NULL;
 }
 
-void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
+void *dmaLoad(void *base_addr, size_t src_off, size_t dst_off, size_t size) {
   return _dmaImpl2(base_addr, src_off, dst_off, size);
 }
 
-void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size) {
+void *dmaStore(void *base_addr, size_t src_off, size_t dst_off, size_t size) {
   return _dmaImpl2(base_addr, src_off, dst_off, size);
 }
 
 #else
 
-__attribute__((__always_inline__))
-void* _dmaImpl1(void* base_addr, size_t offset, size_t size) {
+__attribute__((__always_inline__)) void *_dmaImpl1(void *base_addr,
+                                                   size_t offset, size_t size) {
   assert(size > 0);
   asm("");
   return NULL;
 }
 
-void* dmaLoad(void* addr, size_t offset, size_t size) {
+void *dmaLoad(void *addr, size_t offset, size_t size) {
   return _dmaImpl1(addr, offset, size);
 }
 
-void* dmaStore(void* addr, size_t offset, size_t size) {
+void *dmaStore(void *addr, size_t offset, size_t size) {
   return _dmaImpl1(addr, offset, size);
 }
 #endif
 
-void dmaFence() {
-  asm("");
-}
+void dmaFence() { asm(""); }
diff --git a/hpvm/test/hpvm-cava/src/dma_interface.h b/hpvm/test/hpvm-cava/src/dma_interface.h
index f23234eede4df99db84b144646530dfe240c6e62..771ece523824cff5923581aca671ab7d26fae706 100644
--- a/hpvm/test/hpvm-cava/src/dma_interface.h
+++ b/hpvm/test/hpvm-cava/src/dma_interface.h
@@ -10,12 +10,12 @@
 // Version 3 of the DMA interface enables memcpy operations from arbitrary
 // source and destination addresses.
 
-int dmaLoad(void* dst_addr, void* src_host_addr, size_t size);
-int dmaStore(void* dst_host_addr, void* src_addr, size_t size);
+int dmaLoad(void *dst_addr, void *src_host_addr, size_t size);
+int dmaStore(void *dst_host_addr, void *src_addr, size_t size);
 
 // The user can explicitly toggle the state of ready bits, if ready mode is
 // enabled. This requires support from DMA v3.
-int setReadyBits(void* start_addr, size_t size, unsigned value);
+int setReadyBits(void *start_addr, size_t size, unsigned value);
 
 #elif defined(DMA_INTERFACE_V2)
 
@@ -26,17 +26,18 @@ int setReadyBits(void* start_addr, size_t size, unsigned value);
 // actually copied from source to destination (the memory copy will not show up
 // in the trace).
 
-void* dmaLoad(void* base_addr, size_t src_off, size_t dst_off, size_t size);
-void* dmaStore(void* base_addr, size_t src_off, size_t dst_off, size_t size);
+void *dmaLoad(void *base_addr, size_t src_off, size_t dst_off, size_t size);
+void *dmaStore(void *base_addr, size_t src_off, size_t dst_off, size_t size);
 
 #else
 
 #warning "DMA interface v1 is deprecated!"
 
-// Version 1 of the DMA interface is now deprecated and will be removed entirely.
+// Version 1 of the DMA interface is now deprecated and will be removed
+// entirely.
 
-void* dmaLoad(void* addr, size_t offset, size_t size);
-void* dmaStore(void* addr, size_t offset, size_t size);
+void *dmaLoad(void *addr, size_t offset, size_t size);
+void *dmaStore(void *addr, size_t offset, size_t size);
 
 #endif
 void dmaFence();
diff --git a/hpvm/test/hpvm-cava/src/load_cam_model.c b/hpvm/test/hpvm-cava/src/load_cam_model.c
index 124fe0b7d175c2655feac562ecd6e2a5b73cc96a..3ef24cf429e31ac8f35744005d82b57ac0200611 100644
--- a/hpvm/test/hpvm-cava/src/load_cam_model.c
+++ b/hpvm/test/hpvm-cava/src/load_cam_model.c
@@ -1,13 +1,13 @@
+#include "load_cam_model.h"
+#include "pipe_stages.h"
+#include "utility.h"
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <assert.h>
-#include "utility.h"
-#include "pipe_stages.h"
-#include "load_cam_model.h"
 
 // Get color space transform
-float* get_Ts(char* cam_model_path) {
+float *get_Ts(char *cam_model_path) {
   float *Ts;
   int err = posix_memalign((void **)&Ts, CACHELINE_SIZE, sizeof(float) * 9);
   assert(err == 0 && "Failed to allocate memory!");
@@ -32,7 +32,7 @@ float* get_Ts(char* cam_model_path) {
     str = strtok(line, " \n");
     int i = 0;
     while (str != NULL) {
-      line_data[i] = atof(str); 
+      line_data[i] = atof(str);
       str = strtok(NULL, " \n");
       i++;
     }
@@ -50,7 +50,7 @@ float* get_Ts(char* cam_model_path) {
 }
 
 // Get white balance transform
-float* get_Tw(char* cam_model_path, int wb_index) {
+float *get_Tw(char *cam_model_path, int wb_index) {
   float *Tw;
   int err = posix_memalign((void **)&Tw, CACHELINE_SIZE, sizeof(float) * 9);
   assert(err == 0 && "Failed to allocate memory!");
@@ -62,7 +62,7 @@ float* get_Tw(char* cam_model_path, int wb_index) {
 
   // Calculate base for the white balance transform selected
   // For more details see the camera model readme
-  int wb_base  = 8 + 5*(wb_index-1);
+  int wb_base = 8 + 5 * (wb_index - 1);
 
   // Open file for reading
   // Open file for reading
@@ -81,15 +81,15 @@ float* get_Tw(char* cam_model_path, int wb_index) {
     str = strtok(line, " \n");
     int i = 0;
     while (str != NULL) {
-      line_data[i] = atof(str); 
+      line_data[i] = atof(str);
       str = strtok(NULL, " \n");
       i++;
     }
 
     if (line_idx == wb_base) {
       // Convert the white balance vector into a diagaonal matrix
-      for (int i=0; i<3; i++) {
-        for (int j=0; j<3; j++) {
+      for (int i = 0; i < 3; i++) {
+        for (int j = 0; j < 3; j++) {
           if (i == j) {
             Tw[i * 3 + j] = line_data[i];
           } else {
@@ -105,9 +105,8 @@ float* get_Tw(char* cam_model_path, int wb_index) {
   return Tw;
 }
 
-
 // Get combined transforms for checking
-float* get_TsTw(char* cam_model_path, int wb_index) {
+float *get_TsTw(char *cam_model_path, int wb_index) {
   float *TsTw;
   int err = posix_memalign((void **)&TsTw, CACHELINE_SIZE, sizeof(float) * 9);
   assert(err == 0 && "Failed to allocate memory!");
@@ -119,7 +118,7 @@ float* get_TsTw(char* cam_model_path, int wb_index) {
 
   // Calculate base for the white balance transform selected
   // For more details see the camera model readme
-  int wb_base  = 5 + 5*(wb_index-1);
+  int wb_base = 5 + 5 * (wb_index - 1);
 
   // Open file for reading
   char file_name[] = "raw2jpg_transform.txt";
@@ -137,7 +136,7 @@ float* get_TsTw(char* cam_model_path, int wb_index) {
     str = strtok(line, " \n");
     int i = 0;
     while (str != NULL) {
-      line_data[i] = atof(str); 
+      line_data[i] = atof(str);
       str = strtok(NULL, " \n");
       i++;
     }
@@ -155,7 +154,7 @@ float* get_TsTw(char* cam_model_path, int wb_index) {
 }
 
 // Get control points
-float* get_ctrl_pts(char* cam_model_path, int num_cntrl_pts) {
+float *get_ctrl_pts(char *cam_model_path, int num_cntrl_pts) {
   float *ctrl_pnts;
   int err = posix_memalign((void **)&ctrl_pnts, CACHELINE_SIZE,
                            sizeof(float) * num_cntrl_pts * 3);
@@ -200,7 +199,7 @@ float* get_ctrl_pts(char* cam_model_path, int num_cntrl_pts) {
 }
 
 // Get weights
-float* get_weights(char* cam_model_path, int num_cntrl_pts) {
+float *get_weights(char *cam_model_path, int num_cntrl_pts) {
   float *weights;
   int err = posix_memalign((void **)&weights, CACHELINE_SIZE,
                            sizeof(float) * num_cntrl_pts * 3);
@@ -245,7 +244,7 @@ float* get_weights(char* cam_model_path, int num_cntrl_pts) {
 }
 
 // Get coeficients
-float* get_coefs(char* cam_model_path, int num_cntrl_pts) {
+float *get_coefs(char *cam_model_path, int num_cntrl_pts) {
   float *coefs;
   int err = posix_memalign((void **)&coefs, CACHELINE_SIZE, sizeof(float) * 12);
   assert(err == 0 && "Failed to allocate memory!");
@@ -288,9 +287,8 @@ float* get_coefs(char* cam_model_path, int num_cntrl_pts) {
   return coefs;
 }
 
-
 // Get tone mapping table
-float* get_tone_map(char* cam_model_path) {
+float *get_tone_map(char *cam_model_path) {
   float *tone_map;
   int err = posix_memalign((void **)&tone_map, CACHELINE_SIZE,
                            sizeof(float) * 256 * CHAN_SIZE);
diff --git a/hpvm/test/hpvm-cava/src/main.c b/hpvm/test/hpvm-cava/src/main.c
index c1c0130b4c2c0ec6ec7e792c72323b03a4d508a5..8e7bd197d026773b47fd0e954b56821cd151c60a 100644
--- a/hpvm/test/hpvm-cava/src/main.c
+++ b/hpvm/test/hpvm-cava/src/main.c
@@ -1,14 +1,14 @@
+#include "utility.h"
 #include <argp.h>
+#include <assert.h>
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <assert.h>
 #include <string.h>
-#include <math.h>
-#include "utility.h"
 
 #include "cam_pipe_utility.h"
-#include "pipe_stages.h"
 #include "load_cam_model.h"
+#include "pipe_stages.h"
 
 #include "visc.h"
 
@@ -20,123 +20,135 @@ int NUM_WORKER_THREADS;
 // Type of struct holding the return value from the last node.
 struct RetStruct {
   size_t bytesRet;
-}; 
+};
 
 // Type of struct that is used to pass arguments to the HPVM dataflow graph
 // using the hpvm launch operation
 typedef struct __attribute__((__packed__)) {
-    uint8_t *input; size_t bytes_input;
-    uint8_t *result; size_t bytes_result;
-    float *input_scaled; size_t bytes_input_scaled; 
-    float *result_scaled; size_t bytes_result_scaled;
-    float *demosaic_out; size_t bytes_demosaic_out;
-    float *denoise_out; size_t bytes_denoise_out;
-    float *transform_out; size_t bytes_transform_out;
-    float *gamut_out;size_t bytes_gamut_out;
-    float *TsTw; size_t bytes_TsTw;
-    float *ctrl_pts; size_t bytes_ctrl_pts;
-    float *weights; size_t bytes_weights;
-    float*coefs; size_t bytes_coefs;
-    float *l2_dist; size_t bytes_l2_dist;
-    float *tone_map; size_t bytes_tone_map;
-    int row_size; int col_size;
-    struct RetStruct ret; // Instance of RetStruct holding the return value.
-} 
-RootIn;
+  uint8_t *input;
+  size_t bytes_input;
+  uint8_t *result;
+  size_t bytes_result;
+  float *input_scaled;
+  size_t bytes_input_scaled;
+  float *result_scaled;
+  size_t bytes_result_scaled;
+  float *demosaic_out;
+  size_t bytes_demosaic_out;
+  float *denoise_out;
+  size_t bytes_denoise_out;
+  float *transform_out;
+  size_t bytes_transform_out;
+  float *gamut_out;
+  size_t bytes_gamut_out;
+  float *TsTw;
+  size_t bytes_TsTw;
+  float *ctrl_pts;
+  size_t bytes_ctrl_pts;
+  float *weights;
+  size_t bytes_weights;
+  float *coefs;
+  size_t bytes_coefs;
+  float *l2_dist;
+  size_t bytes_l2_dist;
+  float *tone_map;
+  size_t bytes_tone_map;
+  int row_size;
+  int col_size;
+  struct RetStruct ret; // Instance of RetStruct holding the return value.
+} RootIn;
 
 typedef enum _argnum {
-    RAW_IMAGE_BIN,
-    OUTPUT_IMAGE_BIN,
-    NUM_REQUIRED_ARGS,
-    DATA_FILE = NUM_REQUIRED_ARGS,
-    NUM_ARGS,
+  RAW_IMAGE_BIN,
+  OUTPUT_IMAGE_BIN,
+  NUM_REQUIRED_ARGS,
+  DATA_FILE = NUM_REQUIRED_ARGS,
+  NUM_ARGS,
 } argnum;
 
 typedef struct _arguments {
-    char* args[NUM_ARGS];
-    int num_inputs;
-    int num_threads;
+  char *args[NUM_ARGS];
+  int num_inputs;
+  int num_threads;
 } arguments;
 
 static char prog_doc[] = "\nCamera pipeline on gem5-Aladdin.\n";
 static char args_doc[] = "path/to/raw-image-binary path/to/output-image-binary";
 static struct argp_option options[] = {
-    { "num-inputs", 'n', "N", 0, "Number of input images" }, { 0 },
-    { "data-file", 'f', "F", 0,
-      "File to read data and weights from (if data-init-mode == READ_FILE or "
-      "save-params is true). *.txt files are decoded as text files, while "
-      "*.bin files are decoded as binary files." },
+    {"num-inputs", 'n', "N", 0, "Number of input images"},
+    {0},
+    {"data-file", 'f', "F", 0,
+     "File to read data and weights from (if data-init-mode == READ_FILE or "
+     "save-params is true). *.txt files are decoded as text files, while "
+     "*.bin files are decoded as binary files."},
 };
 
-static error_t parse_opt(int key, char* arg, struct argp_state* state) {
-    arguments* args = (arguments*)(state->input);
-    switch (key) {
-        case 'n': {
-            args->num_inputs = strtol(arg, NULL, 10);
-            break;
-        }
-        case 'f': {
-            args->args[DATA_FILE] = arg;
-            break;
-        }
-        case 't': {
-            args->num_threads = strtol(arg, NULL, 10);
-            break;
-        }
-        case ARGP_KEY_ARG: {
-            if (state->arg_num >= NUM_REQUIRED_ARGS)
-                argp_usage(state);
-            args->args[state->arg_num] = arg;
-            break;
-        }
-        case ARGP_KEY_END: {
-            if (state->arg_num < NUM_REQUIRED_ARGS) {
-                fprintf(stderr,
-                        "Not enough arguments! Got %d, require %d.\n",
-                        state->arg_num,
-                        NUM_REQUIRED_ARGS);
-                argp_usage(state);
-            }
-            break;
-        }
-        default:
-            return ARGP_ERR_UNKNOWN;
+static error_t parse_opt(int key, char *arg, struct argp_state *state) {
+  arguments *args = (arguments *)(state->input);
+  switch (key) {
+  case 'n': {
+    args->num_inputs = strtol(arg, NULL, 10);
+    break;
+  }
+  case 'f': {
+    args->args[DATA_FILE] = arg;
+    break;
+  }
+  case 't': {
+    args->num_threads = strtol(arg, NULL, 10);
+    break;
+  }
+  case ARGP_KEY_ARG: {
+    if (state->arg_num >= NUM_REQUIRED_ARGS)
+      argp_usage(state);
+    args->args[state->arg_num] = arg;
+    break;
+  }
+  case ARGP_KEY_END: {
+    if (state->arg_num < NUM_REQUIRED_ARGS) {
+      fprintf(stderr, "Not enough arguments! Got %d, require %d.\n",
+              state->arg_num, NUM_REQUIRED_ARGS);
+      argp_usage(state);
     }
-    return 0;
+    break;
+  }
+  default:
+    return ARGP_ERR_UNKNOWN;
+  }
+  return 0;
 }
 
-void set_default_args(arguments* args) {
-    args->num_inputs = 1;
-    args->num_threads = 0;
-    for (int i = 0; i < NUM_ARGS; i++) {
-        args->args[i] = NULL;
-    }
+void set_default_args(arguments *args) {
+  args->num_inputs = 1;
+  args->num_threads = 0;
+  for (int i = 0; i < NUM_ARGS; i++) {
+    args->args[i] = NULL;
+  }
 }
 
-static struct argp parser = { options, parse_opt, args_doc, prog_doc };
+static struct argp parser = {options, parse_opt, args_doc, prog_doc};
 
 // Helper function for printing intermediate results
-void descale_cpu(float *input, size_t bytes_input, 
-                 uint8_t *output, size_t bytes_result,
-                 size_t row_size, size_t col_size) {
-  
+void descale_cpu(float *input, size_t bytes_input, uint8_t *output,
+                 size_t bytes_result, size_t row_size, size_t col_size) {
+
   for (int chan = 0; chan < CHAN_SIZE; chan++)
     for (int row = 0; row < row_size; row++)
       for (int col = 0; col < col_size; col++) {
-        int index = (chan*row_size + row) * col_size + col;
+        int index = (chan * row_size + row) * col_size + col;
         output[index] = min(max(input[index] * 255, 0), 255);
       }
 }
 
 static void sort(float arr[], int n) {
-    int i, j;
-    for (i = 0; i < n - 1; i++)
-        for (j = 0; j < n - i - 1; j++)
-            if (arr[j] > arr[j + 1]) {
-                float temp = arr[j];
-                arr[j] = arr[j + 1];
-                arr[j + 1] = temp;
-            }
+  int i, j;
+  for (i = 0; i < n - 1; i++)
+    for (j = 0; j < n - i - 1; j++)
+      if (arr[j] > arr[j + 1]) {
+        float temp = arr[j];
+        arr[j] = arr[j + 1];
+        arr[j + 1] = temp;
+      }
 }
 
 /**************************************************************/
@@ -146,255 +158,258 @@ static void sort(float arr[], int n) {
 // In this benchmark, no use of HPVM query intrinsics in the leaf node functions
 
 // Leaf HPVM node function for scale
-void scale_fxp(uint8_t *input, size_t bytes_input, 
-               float *output, size_t bytes_output,
-               size_t row_size, size_t col_size) {
+void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
+               size_t bytes_output, size_t row_size, size_t col_size) {
 
-  //Specifies compilation target for current node
+  // Specifies compilation target for current node
   __visc__hint(CPU_TARGET);
 
   // Specifies pointer arguments that will be used as "in" and "out" arguments
   // - count of "in" arguments
   // - list of "in" argument , and similar for "out"
   __visc__attributes(2, input, output, 1, output);
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++){
-        int index = (chan*row_size + row) * col_size + col;
-        output[index] = input[index] * 1.0 / 255;
-      }
+    //    for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++) {
+      int index = (chan * row_size + row) * col_size + col;
+      output[index] = input[index] * 1.0 / 255;
+    }
   __visc__return(1, bytes_output);
 }
 
 // Leaf HPVM node function for descale
-void descale_fxp(float *input, size_t bytes_input, 
-                 uint8_t *output, size_t bytes_result,
-                 size_t row_size, size_t col_size) {
+void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
+                 size_t bytes_result, size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, output, 1, output);
-  
+
   for (int chan = 0; chan < CHAN_SIZE; chan++)
     for (int row = 0; row < row_size; row++)
       for (int col = 0; col < col_size; col++) {
-        int index = (chan*row_size + row) * col_size + col;
+        int index = (chan * row_size + row) * col_size + col;
         output[index] = min(max(input[index] * 255, 0), 255);
       }
   __visc__return(1, bytes_result);
 }
 
 // Leaf HPVM node function for demosaicing
-void demosaic_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  size_t row_size, size_t col_size) {
+void demosaic_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, size_t row_size, size_t col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(2, input, result, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
-//  for (int row = 1; row < row_size - 1; row++)
-    for (int col = 1; col < col_size - 1; col++) {
-        int index_0 = (0 * row_size + row) * col_size + col;
-        int index_1 = (1 * row_size + row) * col_size + col;
-        int index_2 = (2 * row_size + row) * col_size + col;
-        if (row % 2 == 0 && col % 2 == 0) {
-            // Green pixel
-            // Getting the R values
-            float R1 = input[index_0 - 1];
-            float R2 = input[index_0 + 1];
-            // Getting the B values
-            float B1 = input[index_2 - col_size];
-            float B2 = input[index_2 + col_size];
-            // R
-            result[index_0] = (R1 + R2) / 2;
-            // G
-            result[index_1] = input[index_1] * 2;
-            // B
-            result[index_2] = (B1 + B2) / 2;
-        } else if (row % 2 == 0 && col % 2 == 1) {
-            // Red pixel
-            // Getting the G values
-            float G1 = input[index_1 - col_size];
-            float G2 = input[index_1 + col_size];
-            float G3 = input[index_1 - 1];
-            float G4 = input[index_1 + 1];
-            // Getting the B values
-            float B1 = input[index_2 - col_size - 1];
-            float B2 = input[index_2 - col_size + 1];
-            float B3 = input[index_2 + col_size - 1];
-            float B4 = input[index_2 + col_size + 1];
-            // R
-            result[index_0] = input[index_0];
-            // G
-            result[index_1] = (G1 + G2 + G3 + G4) / 2;
-            // B (center pixel)
-            result[index_2] = (B1 + B2 + B3 + B4) / 4;
-        } else if (row % 2 == 1 && col % 2 == 0) {
-            // Blue pixel
-            // Getting the R values
-            float R1 = input[index_0 - col_size - 1];
-            float R2 = input[index_0 + col_size - 1];
-            float R3 = input[index_0 - col_size + 1];
-            float R4 = input[index_0 + col_size + 1];
-            // Getting the G values
-            float G1 = input[index_1 - col_size];
-            float G2 = input[index_1 + col_size];
-            float G3 = input[index_1 - 1];
-            float G4 = input[index_1 + 1];
-            // R
-            result[index_0] = (R1 + R2 + R3 + R4) / 4;
-            // G
-            result[index_1] = (G1 + G2 + G3 + G4) / 2;
-            // B
-            result[index_2] = input[index_2];
-        } else {
-            // Bottom Green pixel
-            // Getting the R values
-            float R1 = input[index_0 - col_size];
-            float R2 = input[index_0 + col_size];
-            // Getting the B values
-            float B1 = input[index_2 - 1];
-            float B2 = input[index_2 + 1];
-            // R
-            result[index_0] = (R1 + R2) / 2;
-            // G
-            result[index_1] = input[index_1] * 2;
-            // B
-            result[index_2] = (B1 + B2) / 2;
-        }
-      }
+
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
+  //  for (int row = 1; row < row_size - 1; row++)
+  for (int col = 1; col < col_size - 1; col++) {
+    int index_0 = (0 * row_size + row) * col_size + col;
+    int index_1 = (1 * row_size + row) * col_size + col;
+    int index_2 = (2 * row_size + row) * col_size + col;
+    if (row % 2 == 0 && col % 2 == 0) {
+      // Green pixel
+      // Getting the R values
+      float R1 = input[index_0 - 1];
+      float R2 = input[index_0 + 1];
+      // Getting the B values
+      float B1 = input[index_2 - col_size];
+      float B2 = input[index_2 + col_size];
+      // R
+      result[index_0] = (R1 + R2) / 2;
+      // G
+      result[index_1] = input[index_1] * 2;
+      // B
+      result[index_2] = (B1 + B2) / 2;
+    } else if (row % 2 == 0 && col % 2 == 1) {
+      // Red pixel
+      // Getting the G values
+      float G1 = input[index_1 - col_size];
+      float G2 = input[index_1 + col_size];
+      float G3 = input[index_1 - 1];
+      float G4 = input[index_1 + 1];
+      // Getting the B values
+      float B1 = input[index_2 - col_size - 1];
+      float B2 = input[index_2 - col_size + 1];
+      float B3 = input[index_2 + col_size - 1];
+      float B4 = input[index_2 + col_size + 1];
+      // R
+      result[index_0] = input[index_0];
+      // G
+      result[index_1] = (G1 + G2 + G3 + G4) / 2;
+      // B (center pixel)
+      result[index_2] = (B1 + B2 + B3 + B4) / 4;
+    } else if (row % 2 == 1 && col % 2 == 0) {
+      // Blue pixel
+      // Getting the R values
+      float R1 = input[index_0 - col_size - 1];
+      float R2 = input[index_0 + col_size - 1];
+      float R3 = input[index_0 - col_size + 1];
+      float R4 = input[index_0 + col_size + 1];
+      // Getting the G values
+      float G1 = input[index_1 - col_size];
+      float G2 = input[index_1 + col_size];
+      float G3 = input[index_1 - 1];
+      float G4 = input[index_1 + 1];
+      // R
+      result[index_0] = (R1 + R2 + R3 + R4) / 4;
+      // G
+      result[index_1] = (G1 + G2 + G3 + G4) / 2;
+      // B
+      result[index_2] = input[index_2];
+    } else {
+      // Bottom Green pixel
+      // Getting the R values
+      float R1 = input[index_0 - col_size];
+      float R2 = input[index_0 + col_size];
+      // Getting the B values
+      float B1 = input[index_2 - 1];
+      float B2 = input[index_2 + 1];
+      // R
+      result[index_0] = (R1 + R2) / 2;
+      // G
+      result[index_1] = input[index_1] * 2;
+      // B
+      result[index_2] = (B1 + B2) / 2;
+    }
+  }
   __visc__return(1, bytes_result);
 }
 
 // Leaf HPVM node function for denoise
-void denoise_fxp(float *input, size_t bytes_input, 
-                 float *result, size_t bytes_result,
-                 size_t row_size, size_t col_size) {
+void denoise_fxp(float *input, size_t bytes_input, float *result,
+                 size_t bytes_result, size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++)
-        if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
-          float filter[9];
-          for (int i = -1; i < 2; i++)
-            for (int j = -1; j < 2; j++) {
-              int index = ((i+row) - row + 1) * 3 + (j+col) - col + 1;
-              filter[index] = input[(chan * row_size + (i + row)) * col_size + (j + col)];
-            }
-          sort(filter, 9);
-          result[(chan * row_size + row) * col_size + col] = filter[4];
-        } else {
-      result[(chan * row_size + row) * col_size + col] = input[(chan * row_size + row) * col_size + col];
-        }
+    //    for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++)
+      if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
+        float filter[9];
+        for (int i = -1; i < 2; i++)
+          for (int j = -1; j < 2; j++) {
+            int index = ((i + row) - row + 1) * 3 + (j + col) - col + 1;
+            filter[index] =
+                input[(chan * row_size + (i + row)) * col_size + (j + col)];
+          }
+        sort(filter, 9);
+        result[(chan * row_size + row) * col_size + col] = filter[4];
+      } else {
+        result[(chan * row_size + row) * col_size + col] =
+            input[(chan * row_size + row) * col_size + col];
+      }
   __visc__return(1, bytes_result);
 }
 
 // Leaf HPVM node function, for color map and white balance transform
-void transform_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *TsTw_tran, size_t bytes_TsTw,
+void transform_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
                    size_t row_size, size_t col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(3, input, result, TsTw_tran, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++) {
-        int index = (chan * row_size + row) * col_size + col;
-        int index_0 = (0 * row_size + row) * col_size + col;
-        int index_1 = (1 * row_size + row) * col_size + col;
-        int index_2 = (2 * row_size + row) * col_size + col;
-        int index_2d_0 = 0 * CHAN_SIZE + chan;
-        int index_2d_1 = 1 * CHAN_SIZE + chan;
-        int index_2d_2 = 2 * CHAN_SIZE + chan;
-        result[index] =
-            max(input[index_0] * TsTw_tran[index_2d_0] +
-                input[index_1] * TsTw_tran[index_2d_1] +
-                input[index_2] * TsTw_tran[index_2d_2],
-                0);
-      }
+    //    for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++) {
+      int index = (chan * row_size + row) * col_size + col;
+      int index_0 = (0 * row_size + row) * col_size + col;
+      int index_1 = (1 * row_size + row) * col_size + col;
+      int index_2 = (2 * row_size + row) * col_size + col;
+      int index_2d_0 = 0 * CHAN_SIZE + chan;
+      int index_2d_1 = 1 * CHAN_SIZE + chan;
+      int index_2d_2 = 2 * CHAN_SIZE + chan;
+      result[index] = max(input[index_0] * TsTw_tran[index_2d_0] +
+                              input[index_1] * TsTw_tran[index_2d_1] +
+                              input[index_2] * TsTw_tran[index_2d_2],
+                          0);
+    }
   __visc__return(1, bytes_result);
 }
 
 // Leaf HPVM node function, for gamut mapping
-void gamut_map_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *ctrl_pts, size_t bytes_ctrl_pts,
-                   float *weights, size_t bytes_weights,
-                   float *coefs, size_t bytes_coefs,
-                   float *l2_dist, size_t bytes_l2_dist,
+void gamut_map_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights, float *coefs,
+                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
                    size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2, result, l2_dist);
-
- // First, get the L2 norm from every pixel to the control points,
- // Then, sum it and weight it. Finally, add the bias.
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
-//  for (int row = 0; row < row_size; row++)
-    for (int col = 0; col < col_size; col++) {
-      float chan_val_0 = 0.0;
-      float chan_val_1 = 0.0;
-      float chan_val_2 = 0.0;
-      for (int cp = 0; cp < 3702; cp++) {
-        int index_0 = (0 * row_size + row) * col_size + col;
-        int index_1 = (1 * row_size + row) * col_size + col;
-        int index_2 = (2 * row_size + row) * col_size + col;
-        float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]); 
-        float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
-        float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]); 
-        float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]); 
-        float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]); 
-        float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
-        float val = val1 * val2 + val3 * val4 + val5 * val6;
-        float sqrt_val = sqrt(val);
-        chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0];
-        chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1];
-        chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2];
-      }
-        chan_val_0 += coefs[0 * CHAN_SIZE + 0] + 
-                    coefs[1 * CHAN_SIZE + 0] * input[(0 * row_size + row) * col_size + col] +
-                    coefs[2 * CHAN_SIZE + 0] * input[(1 * row_size + row) * col_size + col] +
-                    coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col];
-        chan_val_1 += coefs[0 * CHAN_SIZE + 1] + 
-                    coefs[1 * CHAN_SIZE + 1] * input[(0 * row_size + row) * col_size + col] +
-                    coefs[2 * CHAN_SIZE + 1] * input[(1 * row_size + row) * col_size + col] +
-                    coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col];
-        chan_val_2 += coefs[0 * CHAN_SIZE + 2] + 
-                    coefs[1 * CHAN_SIZE + 2] * input[(0 * row_size + row) * col_size + col] +
-                    coefs[2 * CHAN_SIZE + 2] * input[(1 * row_size + row) * col_size + col] +
-                    coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col];
-        result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0);
-        result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0);
-        result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0);
+  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 2,
+                     result, l2_dist);
+
+  // First, get the L2 norm from every pixel to the control points,
+  // Then, sum it and weight it. Finally, add the bias.
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
+  //  for (int row = 0; row < row_size; row++)
+  for (int col = 0; col < col_size; col++) {
+    float chan_val_0 = 0.0;
+    float chan_val_1 = 0.0;
+    float chan_val_2 = 0.0;
+    for (int cp = 0; cp < 3702; cp++) {
+      int index_0 = (0 * row_size + row) * col_size + col;
+      int index_1 = (1 * row_size + row) * col_size + col;
+      int index_2 = (2 * row_size + row) * col_size + col;
+      float val1 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
+      float val2 = (input[index_0] - ctrl_pts[cp * 3 + 0]);
+      float val3 = (input[index_1] - ctrl_pts[cp * 3 + 1]);
+      float val4 = (input[index_1] - ctrl_pts[cp * 3 + 1]);
+      float val5 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
+      float val6 = (input[index_2] - ctrl_pts[cp * 3 + 2]);
+      float val = val1 * val2 + val3 * val4 + val5 * val6;
+      float sqrt_val = sqrt(val);
+      chan_val_0 += sqrt_val * weights[cp * CHAN_SIZE + 0];
+      chan_val_1 += sqrt_val * weights[cp * CHAN_SIZE + 1];
+      chan_val_2 += sqrt_val * weights[cp * CHAN_SIZE + 2];
     }
+    chan_val_0 +=
+        coefs[0 * CHAN_SIZE + 0] +
+        coefs[1 * CHAN_SIZE + 0] *
+            input[(0 * row_size + row) * col_size + col] +
+        coefs[2 * CHAN_SIZE + 0] *
+            input[(1 * row_size + row) * col_size + col] +
+        coefs[3 * CHAN_SIZE + 0] * input[(2 * row_size + row) * col_size + col];
+    chan_val_1 +=
+        coefs[0 * CHAN_SIZE + 1] +
+        coefs[1 * CHAN_SIZE + 1] *
+            input[(0 * row_size + row) * col_size + col] +
+        coefs[2 * CHAN_SIZE + 1] *
+            input[(1 * row_size + row) * col_size + col] +
+        coefs[3 * CHAN_SIZE + 1] * input[(2 * row_size + row) * col_size + col];
+    chan_val_2 +=
+        coefs[0 * CHAN_SIZE + 2] +
+        coefs[1 * CHAN_SIZE + 2] *
+            input[(0 * row_size + row) * col_size + col] +
+        coefs[2 * CHAN_SIZE + 2] *
+            input[(1 * row_size + row) * col_size + col] +
+        coefs[3 * CHAN_SIZE + 2] * input[(2 * row_size + row) * col_size + col];
+    result[(0 * row_size + row) * col_size + col] = max(chan_val_0, 0);
+    result[(1 * row_size + row) * col_size + col] = max(chan_val_1, 0);
+    result[(2 * row_size + row) * col_size + col] = max(chan_val_2, 0);
+  }
   __visc__return(1, bytes_result);
 }
 
 // HPVM leaf node function, for tone mapping
-void tone_map_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  float *tone_map, size_t bytes_tone_map,
+void tone_map_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
                   size_t row_size, size_t col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(3, input, result, tone_map, 1, result);
-  
-  void* thisNode = __visc__getNode();
-	int row = __visc__getNodeInstanceID_x(thisNode);
+
+  void *thisNode = __visc__getNode();
+  int row = __visc__getNodeInstanceID_x(thisNode);
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-//    for (int row = 0; row < row_size; row++)
-      for (int col = 0; col < col_size; col++) {
-        int index = (chan * row_size + row) * col_size + col;
-        uint8_t x = input[index] * 255;
-        result[index] = tone_map[x * CHAN_SIZE + chan];
-      }
+    //    for (int row = 0; row < row_size; row++)
+    for (int col = 0; col < col_size; col++) {
+      int index = (chan * row_size + row) * col_size + col;
+      uint8_t x = input[index] * 255;
+      result[index] = tone_map[x * CHAN_SIZE + chan];
+    }
   __visc__return(1, bytes_result);
 }
 
@@ -406,9 +421,8 @@ void tone_map_fxp(float *input, size_t bytes_input,
 // requirement for the FPGA backend . The CPU backend also supports this,
 // so it does not cause a portability issue.
 
-void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
+void scale_fxp_wrapper(uint8_t *input, size_t bytes_input, float *result,
+                       size_t bytes_result, size_t row_size, size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
 
@@ -433,9 +447,9 @@ void scale_fxp_wrapper(uint8_t *input, size_t bytes_input,
   __visc__bindOut(ScaleNode, 0, 0, 0);
 }
 
-void descale_fxp_wrapper(float *input, size_t bytes_input, 
-                       uint8_t *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
+void descale_fxp_wrapper(float *input, size_t bytes_input, uint8_t *result,
+                         size_t bytes_result, size_t row_size,
+                         size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
   void *DescaleNode = __visc__createNodeND(1, descale_fxp, row_size);
@@ -445,13 +459,13 @@ void descale_fxp_wrapper(float *input, size_t bytes_input,
   __visc__bindIn(DescaleNode, 3, 3, 0); // bind bytes_result
   __visc__bindIn(DescaleNode, 4, 4, 0); // bind row_size
   __visc__bindIn(DescaleNode, 5, 5, 0); // bind col_size
-  
+
   __visc__bindOut(DescaleNode, 0, 0, 0);
 }
 
-void demosaic_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
+void demosaic_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                          size_t bytes_result, size_t row_size,
+                          size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
   void *DemosaicNode = __visc__createNodeND(1, demosaic_fxp, row_size);
@@ -461,13 +475,13 @@ void demosaic_fxp_wrapper(float *input, size_t bytes_input,
   __visc__bindIn(DemosaicNode, 3, 3, 0); // bind bytes_result
   __visc__bindIn(DemosaicNode, 4, 4, 0); // bind row_size
   __visc__bindIn(DemosaicNode, 5, 5, 0); // bind col_size
-  
+
   __visc__bindOut(DemosaicNode, 0, 0, 0);
 }
 
-void denoise_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       size_t row_size, size_t col_size) {
+void denoise_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                         size_t bytes_result, size_t row_size,
+                         size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(2, input, result, 1, result);
   void *DenoiseNode = __visc__createNodeND(1, denoise_fxp, row_size);
@@ -477,14 +491,14 @@ void denoise_fxp_wrapper(float *input, size_t bytes_input,
   __visc__bindIn(DenoiseNode, 3, 3, 0); // bind bytes_result
   __visc__bindIn(DenoiseNode, 4, 4, 0); // bind row_size
   __visc__bindIn(DenoiseNode, 5, 5, 0); // bind col_size
-  
+
   __visc__bindOut(DenoiseNode, 0, 0, 0);
 }
 
-void transform_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       float *TsTw_tran, size_t bytes_TsTw,
-                       size_t row_size, size_t col_size) {
+void transform_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                           size_t bytes_result, float *TsTw_tran,
+                           size_t bytes_TsTw, size_t row_size,
+                           size_t col_size) {
   __visc__hint(CPU_TARGET);
   __visc__attributes(3, input, result, TsTw_tran, 1, result);
   void *TransformNode = __visc__createNodeND(1, transform_fxp, row_size);
@@ -496,41 +510,41 @@ void transform_fxp_wrapper(float *input, size_t bytes_input,
   __visc__bindIn(TransformNode, 5, 5, 0); // bind bytes_tstw
   __visc__bindIn(TransformNode, 6, 6, 0); // bind row_size
   __visc__bindIn(TransformNode, 7, 7, 0); // bind col_size
-  
+
   __visc__bindOut(TransformNode, 0, 0, 0);
 }
 
-void gamut_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       float *ctrl_pts, size_t bytes_ctrl_pts,
-                       float *weights, size_t bytes_weights,
-                       float *coefs, size_t bytes_coefs,
-                       float *l2_dist, size_t bytes_l2_dist,
-                       size_t row_size, size_t col_size) {
+void gamut_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                       size_t bytes_result, float *ctrl_pts,
+                       size_t bytes_ctrl_pts, float *weights,
+                       size_t bytes_weights, float *coefs, size_t bytes_coefs,
+                       float *l2_dist, size_t bytes_l2_dist, size_t row_size,
+                       size_t col_size) {
   __visc__hint(CPU_TARGET);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result);
+  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1,
+                     result);
   void *GamutNode = __visc__createNodeND(1, gamut_map_fxp, row_size);
-  __visc__bindIn(GamutNode, 0, 0, 0); // bind input
-  __visc__bindIn(GamutNode, 1, 1, 0); // bind bytes_input
-  __visc__bindIn(GamutNode, 2, 2, 0); // bind result
-  __visc__bindIn(GamutNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(GamutNode, 4, 4, 0); // bind ctrl_pts
-  __visc__bindIn(GamutNode, 5, 5, 0); // bind bytes_ctrl_pts
-  __visc__bindIn(GamutNode, 6, 6, 0); // bind weights
-  __visc__bindIn(GamutNode, 7, 7, 0); // bind bytes_weights
-  __visc__bindIn(GamutNode, 8, 8, 0); // bind coefs
-  __visc__bindIn(GamutNode, 9, 9, 0); // bind bytes_coefs
+  __visc__bindIn(GamutNode, 0, 0, 0);   // bind input
+  __visc__bindIn(GamutNode, 1, 1, 0);   // bind bytes_input
+  __visc__bindIn(GamutNode, 2, 2, 0);   // bind result
+  __visc__bindIn(GamutNode, 3, 3, 0);   // bind bytes_result
+  __visc__bindIn(GamutNode, 4, 4, 0);   // bind ctrl_pts
+  __visc__bindIn(GamutNode, 5, 5, 0);   // bind bytes_ctrl_pts
+  __visc__bindIn(GamutNode, 6, 6, 0);   // bind weights
+  __visc__bindIn(GamutNode, 7, 7, 0);   // bind bytes_weights
+  __visc__bindIn(GamutNode, 8, 8, 0);   // bind coefs
+  __visc__bindIn(GamutNode, 9, 9, 0);   // bind bytes_coefs
   __visc__bindIn(GamutNode, 10, 10, 0); // bind l2_dist
   __visc__bindIn(GamutNode, 11, 11, 0); // bind bytes_l2_dist
   __visc__bindIn(GamutNode, 12, 12, 0); // bind row_size
   __visc__bindIn(GamutNode, 13, 13, 0); // bind col_size
-  
+
   __visc__bindOut(GamutNode, 0, 0, 0);
 }
-void tone_map_fxp_wrapper(float *input, size_t bytes_input, 
-                       float *result, size_t bytes_result,
-                       float *tone_map, size_t bytes_tone_map,
-                       size_t row_size, size_t col_size) {
+void tone_map_fxp_wrapper(float *input, size_t bytes_input, float *result,
+                          size_t bytes_result, float *tone_map,
+                          size_t bytes_tone_map, size_t row_size,
+                          size_t col_size) {
 
   __visc__hint(CPU_TARGET);
   __visc__attributes(3, input, result, tone_map, 1, result);
@@ -539,52 +553,52 @@ void tone_map_fxp_wrapper(float *input, size_t bytes_input,
   __visc__bindIn(ToneMapNode, 1, 1, 0); // bind bytes_input
   __visc__bindIn(ToneMapNode, 2, 2, 0); // bind result
   __visc__bindIn(ToneMapNode, 3, 3, 0); // bind bytes_result
-  __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map 
+  __visc__bindIn(ToneMapNode, 4, 4, 0); // bind tone_map
   __visc__bindIn(ToneMapNode, 5, 5, 0); // bind bytes_tone_map
   __visc__bindIn(ToneMapNode, 6, 6, 0); // bind row_size
   __visc__bindIn(ToneMapNode, 7, 7, 0); // bind col_size
-  
+
   __visc__bindOut(ToneMapNode, 0, 0, 0);
 }
 
-
 /*** ROOT Node - Top Level of the Graph Hierarchy ***/
-void CamPipeRoot(/*0*/ uint8_t *input,         /*1*/ size_t bytes_input, 
-                 /*2*/ uint8_t *result,        /*3*/ size_t bytes_result,
-                 /*4*/ float *input_scaled,    /*5*/ size_t bytes_input_scaled,
-                 /*6*/ float *result_scaled,   /*7*/ size_t bytes_result_scaled,
-                 /*8*/ float *demosaic_out,    /*9*/ size_t bytes_demosaic_out,
-                 /*10*/ float *denoise_out,    /*11*/ size_t bytes_denoise_out,
-                 /*12*/ float *transform_out,  /*13*/ size_t bytes_transform_out,
-                 /*14*/ float *gamut_out,      /*15*/ size_t bytes_gamut_out,
-                 /*16*/ float *TsTw,           /*17*/ size_t bytes_TsTw,
-                 /*18*/ float *ctrl_pts,       /*19*/ size_t bytes_ctrl_pts,
-                 /*20*/ float *weights,        /*21*/ size_t bytes_weights,
-                 /*22*/ float*coefs,           /*23*/ size_t bytes_coefs,
-                 /*24*/ float *l2_dist,        /*25*/ size_t bytes_l2_dist,
-                 /*26*/ float *tone_map,       /*27*/ size_t bytes_tone_map,
-                 /*28*/ size_t row_size,          /*29*/ size_t col_size) {
-
-  //Specifies compilation target for current node
-    __visc__hint(CPU_TARGET);
+void CamPipeRoot(/*0*/ uint8_t *input, /*1*/ size_t bytes_input,
+                 /*2*/ uint8_t *result, /*3*/ size_t bytes_result,
+                 /*4*/ float *input_scaled, /*5*/ size_t bytes_input_scaled,
+                 /*6*/ float *result_scaled, /*7*/ size_t bytes_result_scaled,
+                 /*8*/ float *demosaic_out, /*9*/ size_t bytes_demosaic_out,
+                 /*10*/ float *denoise_out, /*11*/ size_t bytes_denoise_out,
+                 /*12*/ float *transform_out, /*13*/ size_t bytes_transform_out,
+                 /*14*/ float *gamut_out, /*15*/ size_t bytes_gamut_out,
+                 /*16*/ float *TsTw, /*17*/ size_t bytes_TsTw,
+                 /*18*/ float *ctrl_pts, /*19*/ size_t bytes_ctrl_pts,
+                 /*20*/ float *weights, /*21*/ size_t bytes_weights,
+                 /*22*/ float *coefs, /*23*/ size_t bytes_coefs,
+                 /*24*/ float *l2_dist, /*25*/ size_t bytes_l2_dist,
+                 /*26*/ float *tone_map, /*27*/ size_t bytes_tone_map,
+                 /*28*/ size_t row_size, /*29*/ size_t col_size) {
+
+  // Specifies compilation target for current node
+  __visc__hint(CPU_TARGET);
 
   // Specifies pointer arguments that will be used as "in" and "out" arguments
   // - count of "in" arguments
   // - list of "in" argument , and similar for "out"
-    __visc__attributes(14, input, result, input_scaled, result_scaled, demosaic_out, denoise_out, 
-                       transform_out, gamut_out, TsTw, ctrl_pts, weights, coefs, tone_map, l2_dist, 
-                       5, result, demosaic_out, denoise_out, transform_out, gamut_out);
+  __visc__attributes(14, input, result, input_scaled, result_scaled,
+                     demosaic_out, denoise_out, transform_out, gamut_out, TsTw,
+                     ctrl_pts, weights, coefs, tone_map, l2_dist, 5, result,
+                     demosaic_out, denoise_out, transform_out, gamut_out);
 
   // Create an 0D (specified by 1st argument) HPVM node - so a single node
   // associated with node function ---_fxp_wrapper
-    void* ScNode = __visc__createNodeND(0, scale_fxp_wrapper);
-    void* DmNode = __visc__createNodeND(0, demosaic_fxp_wrapper);
-    void *DnNode = __visc__createNodeND(0, denoise_fxp_wrapper);
-    void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper);
-    void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper);
-    void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper);
-    void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper);
-    
+  void *ScNode = __visc__createNodeND(0, scale_fxp_wrapper);
+  void *DmNode = __visc__createNodeND(0, demosaic_fxp_wrapper);
+  void *DnNode = __visc__createNodeND(0, denoise_fxp_wrapper);
+  void *TrNode = __visc__createNodeND(0, transform_fxp_wrapper);
+  void *GmNode = __visc__createNodeND(0, gamut_fxp_wrapper);
+  void *TnNode = __visc__createNodeND(0, tone_map_fxp_wrapper);
+  void *DsNode = __visc__createNodeND(0, descale_fxp_wrapper);
+
   // BindIn binds inputs of current node with specified node
   // - destination node
   // - argument position in argument list of function of source node
@@ -598,272 +612,283 @@ void CamPipeRoot(/*0*/ uint8_t *input,         /*1*/ size_t bytes_input,
   // - destination position (in argument list of destination node)
   // - streaming (1) or non-streaming (0)
 
-    // scale_fxp inputs
-    __visc__bindIn(ScNode, 0, 0, 0); // input -> ScNode:input
-    __visc__bindIn(ScNode, 1, 1, 0); // bytes_input -> ScNode:bytes_input
-    __visc__bindIn(ScNode, 4, 2, 0); // input_scaled -> ScNode:result
-    __visc__bindIn(ScNode, 5, 3, 0); // bytes_input_scaled -> ScNode:bytes_result
-    __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size
-    __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size
-
-    // demosaic_fxp inputs
-    __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input
-    __visc__edge(ScNode, DmNode, 1, 0, 1, 0); // SCNode:bytes_result -> DmNode:bytes_input
-    __visc__bindIn(DmNode, 8, 2, 0); // demosaic_out -> DmNode:result
-    __visc__bindIn(DmNode, 9, 3, 0); // bytes_demosaic_out -> DmNode:bytes_result
-    __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size 
-    __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size
-
-    // denoise_fxp inputs
-    __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input
-    __visc__edge(DmNode, DnNode, 1, 0, 1, 0); // DMNode:bytes_result -> DnNode:bytes_input
-    __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result
-    __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result
-    __visc__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size 
-    __visc__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size
-    
-    // transform_fxp inputs
-    __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input
-    __visc__edge(DnNode, TrNode, 1, 0, 1, 0); // DnNode:bytes_result -> TrNode:bytes_input
-    __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result
-    __visc__bindIn(TrNode, 13, 3, 0); // bytes_result_scaled -> TrNode:bytes_result
-    __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_trann
-    __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw
-    __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size 
-    __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size
-    
-    // gamut_fxp inputs
-    __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input
-    __visc__edge(TrNode, GmNode, 1, 0, 1, 0); // TrNode:bytes_result -> GmNode:bytes_input
-    __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result
-    __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result
-    __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts
-    __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts
-    __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights
-    __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights
-    __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs
-    __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs
-    __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist
-    __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist
-    __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size 
-    __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size
-    
-    // tone_map_fxp inputs
-    __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input
-    __visc__edge(GmNode, TnNode, 1, 0, 1, 0); // GmNode:bytes_result -> TnNode:bytes_input
-    __visc__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result
-    __visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result
-    __visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map
-    __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map
-    __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size 
-    __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size
-
-    // descale_fxp inputs
-    __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input
-    __visc__edge(TnNode, DsNode, 1, 0, 1, 0); // TnNode:bytes_result -> DsNode:bytes_input
-    __visc__bindIn(DsNode, 2, 2, 0); // result -> DsNode:result
-    __visc__bindIn(DsNode, 3, 3, 0); // bytes_result -> DsNode:bytes_result
-    __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size
-    __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size
+  // scale_fxp inputs
+  __visc__bindIn(ScNode, 0, 0, 0);  // input -> ScNode:input
+  __visc__bindIn(ScNode, 1, 1, 0);  // bytes_input -> ScNode:bytes_input
+  __visc__bindIn(ScNode, 4, 2, 0);  // input_scaled -> ScNode:result
+  __visc__bindIn(ScNode, 5, 3, 0);  // bytes_input_scaled -> ScNode:bytes_result
+  __visc__bindIn(ScNode, 28, 4, 0); // row_size -> ScNode:row_size
+  __visc__bindIn(ScNode, 29, 5, 0); // col_size -> ScNode:col_size
+
+  // demosaic_fxp inputs
+  __visc__bindIn(DmNode, 4, 0, 0); // input_scaled -> DmNode:input
+  __visc__edge(ScNode, DmNode, 1, 0, 1,
+               0);                  // ScNode:bytes_result -> DmNode:bytes_input
+  __visc__bindIn(DmNode, 8, 2, 0);  // demosaic_out -> DmNode:result
+  __visc__bindIn(DmNode, 9, 3, 0);  // bytes_demosaic_out -> DmNode:bytes_result
+  __visc__bindIn(DmNode, 28, 4, 0); // row_size -> DmNode:row_size
+  __visc__bindIn(DmNode, 29, 5, 0); // col_size -> DmNode:col_size
+
+  // denoise_fxp inputs
+  __visc__bindIn(DnNode, 8, 0, 0); // demosaic_out -> DnNode:input
+  __visc__edge(DmNode, DnNode, 1, 0, 1,
+               0);                  // DmNode:bytes_result -> DnNode:bytes_input
+  __visc__bindIn(DnNode, 10, 2, 0); // denoise_out -> DnNode:result
+  __visc__bindIn(DnNode, 11, 3, 0); // bytes_denoise_out -> DnNode:bytes_result
+  __visc__bindIn(DnNode, 28, 4, 0); // row_size -> DnNode:row_size
+  __visc__bindIn(DnNode, 29, 5, 0); // col_size -> DnNode:col_size
+
+  // transform_fxp inputs
+  __visc__bindIn(TrNode, 10, 0, 0); // denoise_out -> TrNode:input
+  __visc__edge(DnNode, TrNode, 1, 0, 1,
+               0);                  // DnNode:bytes_result -> TrNode:bytes_input
+  __visc__bindIn(TrNode, 12, 2, 0); // transform_out -> TrNode:result
+  __visc__bindIn(TrNode, 13, 3,
+                 0); // bytes_transform_out -> TrNode:bytes_result
+  __visc__bindIn(TrNode, 16, 4, 0); // TsTw -> TrNode:TsTw_tran
+  __visc__bindIn(TrNode, 17, 5, 0); // bytes_TsTw -> TrNode:bytes_TsTw
+  __visc__bindIn(TrNode, 28, 6, 0); // row_size -> TrNode:row_size
+  __visc__bindIn(TrNode, 29, 7, 0); // col_size -> TrNode:col_size
+
+  // gamut_fxp inputs
+  __visc__bindIn(GmNode, 12, 0, 0); // transform_out -> GmNode:input
+  __visc__edge(TrNode, GmNode, 1, 0, 1,
+               0);                  // TrNode:bytes_result -> GmNode:bytes_input
+  __visc__bindIn(GmNode, 14, 2, 0); // gamut_out -> GmNode:result
+  __visc__bindIn(GmNode, 15, 3, 0); // bytes_gamut_out -> GmNode:bytes_result
+  __visc__bindIn(GmNode, 18, 4, 0); // ctrl_pts -> GmNode:ctrl_pts
+  __visc__bindIn(GmNode, 19, 5, 0); // bytes_ctrl_pts -> GmNode:bytes_ctrl_pts
+  __visc__bindIn(GmNode, 20, 6, 0); // weights -> GmNode:weights
+  __visc__bindIn(GmNode, 21, 7, 0); // bytes_weights -> GmNode:bytes_weights
+  __visc__bindIn(GmNode, 22, 8, 0); // coefs -> GmNode:coefs
+  __visc__bindIn(GmNode, 23, 9, 0); // bytes_coefs -> GmNode:bytes_coefs
+  __visc__bindIn(GmNode, 24, 10, 0); // l2_dist -> GmNode: l2_dist
+  __visc__bindIn(GmNode, 25, 11, 0); // bytes_l2_dist -> GmNode:bytes_l2_dist
+  __visc__bindIn(GmNode, 28, 12, 0); // row_size -> GmNode:row_size
+  __visc__bindIn(GmNode, 29, 13, 0); // col_size -> GmNode:col_size
+
+  // tone_map_fxp inputs
+  __visc__bindIn(TnNode, 14, 0, 0); // gamut_out -> TnNode:input
+  __visc__edge(GmNode, TnNode, 1, 0, 1,
+               0);                 // GmNode:bytes_result -> TnNode:bytes_input
+  __visc__bindIn(TnNode, 6, 2, 0); // result_scaled -> TnNode:result
+  __visc__bindIn(TnNode, 7, 3, 0); // bytes_result_scaled -> TnNode:bytes_result
+  __visc__bindIn(TnNode, 26, 4, 0); // tone_map -> TnNode:tone_map
+  __visc__bindIn(TnNode, 27, 5, 0); // bytes_tone_map -> TnNode:bytes_tone_map
+  __visc__bindIn(TnNode, 28, 6, 0); // row_size -> TnNode:row_size
+  __visc__bindIn(TnNode, 29, 7, 0); // col_size -> TnNode:col_size
+
+  // descale_fxp inputs
+  __visc__bindIn(DsNode, 6, 0, 0); // result_scaled -> DsNode:input
+  __visc__edge(TnNode, DsNode, 1, 0, 1,
+               0);                  // TnNode:bytes_result -> DsNode:bytes_input
+  __visc__bindIn(DsNode, 2, 2, 0);  // result -> DsNode:result
+  __visc__bindIn(DsNode, 3, 3, 0);  // bytes_result -> DsNode:bytes_result
+  __visc__bindIn(DsNode, 28, 4, 0); // row_size -> DsNode:row_size
+  __visc__bindIn(DsNode, 29, 5, 0); // col_size -> DsNode:col_size
 
   // Similar to bindIn, but for the output. Output of a node is a struct, and
   // we consider the fields in increasing ordering.
-    __visc__bindOut(DsNode, 0, 0, 0);
-    
+  __visc__bindOut(DsNode, 0, 0, 0);
 }
 
-int main(int argc, char* argv[]) {
-    // Parse the arguments.
-    arguments args;
-    set_default_args(&args);
-    argp_parse(&parser, argc, argv, 0, 0, &args);
-
-    // Read a raw image.
-    // NOTE: We deliberately perform this file I/O outside of the kernel.
-    printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]);
-    size_t row_size, col_size;
-    uint8_t *image_in = read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size);
-
-    printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE);
-
-    // Allocate a buffer for storing the output image data.
-    // (This is currently the same size as the input image data.)
-    size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE;
-    size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE;
-    uint8_t *image_out = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_gamut = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_demosaic = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_denoise = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t *image_out_transform = (uint8_t*) malloc_aligned(bytes_image);
-
-    __visc__init();
-
-    ///////////////////////////////////////////////////////////////
-    // Camera Model Parameters
-    ///////////////////////////////////////////////////////////////
-    // Path to the camera model to be used
-//    char cam_model_path[100];
-//    char cam_model_path = "cam_models/NikonD7000/";
-    // White balance index (select white balance from transform file)
-    // The first white balance in the file has a wb_index of 1
-    // For more information on model format see the readme
-    int wb_index = 6;
-
-    // Number of control points
-    int num_ctrl_pts = 3702;
-    uint8_t *input, *result;
-    float *input_scaled, *result_scaled, *demosaic_out, *denoise_out, *transform_out, *gamut_out;
-    float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist;
-
-    TsTw = get_TsTw("cam_models/NikonD7000/", wb_index);
-    float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE);
-    free(TsTw);
-    TsTw = trans;
-    ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts);
-    weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts);
-    coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts);
-    tone_map = get_tone_map("cam_models/NikonD7000/");
-    
-    input_scaled = (float*) malloc_aligned(bytes_fimage);
-    result_scaled = (float*) malloc_aligned(bytes_fimage);
-    demosaic_out = (float*) malloc_aligned(bytes_fimage);
-    denoise_out = (float*) malloc_aligned(bytes_fimage);
-    transform_out  = (float*) malloc_aligned(bytes_fimage);
-    gamut_out = (float*) malloc_aligned(bytes_fimage);
-    l2_dist = (float*) malloc_aligned(sizeof(float) * num_ctrl_pts);    
-    
-    // This is host_input in cam_pipe()
-    input = (uint8_t*) malloc_aligned(bytes_image);
-    convert_hwc_to_chw(image_in, row_size, col_size, &input);
-    
-    // This is host_result in cam_pipe()
-    result = (uint8_t*) malloc_aligned(bytes_image);
-
-    // Allocate struct to pass DFG inputs
-    RootIn* rootArgs = (RootIn*) malloc(sizeof(RootIn));
-
-    // Set up HPVM DFG inputs in the rootArgs struct.
-    rootArgs->input = input;
-    rootArgs->bytes_input = bytes_image;
-    
-    rootArgs->result = result;
-    rootArgs->bytes_result = bytes_image;
-    
-    rootArgs->input_scaled = input_scaled;
-    rootArgs->bytes_input_scaled = bytes_fimage;
-    
-    rootArgs->result_scaled = result_scaled;
-    rootArgs->bytes_result_scaled = bytes_fimage;
-    
-    rootArgs->demosaic_out = demosaic_out;
-    rootArgs->bytes_demosaic_out = bytes_fimage;
-    
-    rootArgs->denoise_out = denoise_out;
-    rootArgs->bytes_denoise_out = bytes_fimage;
-    
-    rootArgs->transform_out = transform_out;
-    rootArgs->bytes_transform_out = bytes_fimage;
-
-    rootArgs->gamut_out = gamut_out;
-    rootArgs->bytes_gamut_out = bytes_fimage;
-
-    rootArgs->TsTw = TsTw;
-    rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->ctrl_pts = ctrl_pts;
-    rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->weights = weights;
-    rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->coefs = coefs;
-    rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->tone_map = tone_map;
-    rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float);
-    
-    rootArgs->l2_dist = l2_dist;
-    rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float);
-    
-    rootArgs->row_size = row_size;
-    rootArgs->col_size = col_size;
-
-    // Memory tracking is required for pointer arguments.
-    // Nodes can be scheduled on different targets, and 
-    // dataflow edge implementation needs to request data.
-    // The pair (pointer, size) is inserted in memory tracker using this call
-    llvm_visc_track_mem(input, bytes_image);
-    llvm_visc_track_mem(result, bytes_image);
-    llvm_visc_track_mem(input_scaled, bytes_fimage);
-    llvm_visc_track_mem(result_scaled, bytes_fimage);
-    llvm_visc_track_mem(demosaic_out, bytes_fimage);
-    llvm_visc_track_mem(denoise_out, bytes_fimage);
-    llvm_visc_track_mem(transform_out, bytes_fimage);
-    llvm_visc_track_mem(gamut_out, bytes_fimage);
-    llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float)); 
-    llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float));
-    llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float));
-    llvm_visc_track_mem(coefs, 4 * CHAN_SIZE *sizeof(float));
-    llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float));
-    llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float));
-    
-    printf("\n\nLaunching CAVA pipeline!\n");
-
-    void* camPipeDFG = __visc__launch(0, CamPipeRoot, (void*) rootArgs);
-    __visc__wait(camPipeDFG);
-
-    printf("\n\nPipeline execution completed!\n");
-    printf(
-      "Pipeline final stage returned %lu; should be %lu\n",
-      rootArgs->ret.bytesRet, bytes_image
-    );
-    printf("\n\nRequesting memory!\n");
-
-    // Request data from graph.    
-    llvm_visc_request_mem(result, bytes_image);
-    llvm_visc_request_mem(demosaic_out, bytes_fimage);
-    llvm_visc_request_mem(denoise_out, bytes_fimage);
-    llvm_visc_request_mem(transform_out, bytes_fimage);
-    llvm_visc_request_mem(gamut_out, bytes_fimage);
-    printf("\n\nDone requesting memory!\n");
-
-
-    uint8_t* gamut_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-  uint8_t* demosaic_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t* transform_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-    uint8_t* denoise_out_descaled = (uint8_t*) malloc_aligned(bytes_image);
-    
-  descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image, row_size, col_size);
-    descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image, row_size, col_size);
-    descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image, row_size, col_size);
-    descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image, row_size, col_size);
-    
-    convert_chw_to_hwc(result, row_size, col_size, &image_out);
-   convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut);
-    convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size, &image_out_demosaic);
-    convert_chw_to_hwc(denoise_out_descaled, row_size, col_size, &image_out_denoise);
-    convert_chw_to_hwc(transform_out_descaled, row_size, col_size, &image_out_transform);
-
-    
-    // Remove tracked pointers.
-    llvm_visc_untrack_mem(input);
-    llvm_visc_untrack_mem(result);
-    llvm_visc_untrack_mem(input_scaled);
-    llvm_visc_untrack_mem(result_scaled);
-    llvm_visc_untrack_mem(demosaic_out);
-    llvm_visc_untrack_mem(denoise_out);
-    llvm_visc_untrack_mem(transform_out);
-    llvm_visc_untrack_mem(gamut_out);
-    
-    llvm_visc_untrack_mem(TsTw); 
-    llvm_visc_untrack_mem(ctrl_pts);
-    llvm_visc_untrack_mem(weights);
-    llvm_visc_untrack_mem(coefs);
-    llvm_visc_untrack_mem(tone_map);
-    llvm_visc_untrack_mem(l2_dist);
-
-    // Output the image.
-    // NOTE: We deliberately perform this file I/O outside of the kernel.
+int main(int argc, char *argv[]) {
+  // Parse the arguments.
+  arguments args;
+  set_default_args(&args);
+  argp_parse(&parser, argc, argv, 0, 0, &args);
+
+  // Read a raw image.
+  // NOTE: We deliberately perform this file I/O outside of the kernel.
+  printf("Reading a raw image from %s\n", args.args[RAW_IMAGE_BIN]);
+  size_t row_size, col_size;
+  uint8_t *image_in =
+      read_image_from_binary(args.args[RAW_IMAGE_BIN], &row_size, &col_size);
+
+  printf("Raw image shape: %d x %d x %d\n", row_size, col_size, CHAN_SIZE);
+
+  // Allocate a buffer for storing the output image data.
+  // (This is currently the same size as the input image data.)
+  size_t bytes_image = sizeof(uint8_t) * row_size * col_size * CHAN_SIZE;
+  size_t bytes_fimage = sizeof(float) * row_size * col_size * CHAN_SIZE;
+  uint8_t *image_out = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_gamut = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_demosaic = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_denoise = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *image_out_transform = (uint8_t *)malloc_aligned(bytes_image);
+
+  __visc__init();
+
+  ///////////////////////////////////////////////////////////////
+  // Camera Model Parameters
+  ///////////////////////////////////////////////////////////////
+  // Path to the camera model to be used
+  //    char cam_model_path[100];
+  //    char cam_model_path = "cam_models/NikonD7000/";
+  // White balance index (select white balance from transform file)
+  // The first white balance in the file has a wb_index of 1
+  // For more information on model format see the readme
+  int wb_index = 6;
+
+  // Number of control points
+  int num_ctrl_pts = 3702;
+  uint8_t *input, *result;
+  float *input_scaled, *result_scaled, *demosaic_out, *denoise_out,
+      *transform_out, *gamut_out;
+  float *TsTw, *ctrl_pts, *weights, *coefs, *tone_map, *l2_dist;
+
+  TsTw = get_TsTw("cam_models/NikonD7000/", wb_index);
+  float *trans = transpose_mat(TsTw, CHAN_SIZE, CHAN_SIZE);
+  free(TsTw);
+  TsTw = trans;
+  ctrl_pts = get_ctrl_pts("cam_models/NikonD7000/", num_ctrl_pts);
+  weights = get_weights("cam_models/NikonD7000/", num_ctrl_pts);
+  coefs = get_coefs("cam_models/NikonD7000/", num_ctrl_pts);
+  tone_map = get_tone_map("cam_models/NikonD7000/");
+
+  input_scaled = (float *)malloc_aligned(bytes_fimage);
+  result_scaled = (float *)malloc_aligned(bytes_fimage);
+  demosaic_out = (float *)malloc_aligned(bytes_fimage);
+  denoise_out = (float *)malloc_aligned(bytes_fimage);
+  transform_out = (float *)malloc_aligned(bytes_fimage);
+  gamut_out = (float *)malloc_aligned(bytes_fimage);
+  l2_dist = (float *)malloc_aligned(sizeof(float) * num_ctrl_pts);
+
+  // This is host_input in cam_pipe()
+  input = (uint8_t *)malloc_aligned(bytes_image);
+  convert_hwc_to_chw(image_in, row_size, col_size, &input);
+
+  // This is host_result in cam_pipe()
+  result = (uint8_t *)malloc_aligned(bytes_image);
+
+  // Allocate struct to pass DFG inputs
+  RootIn *rootArgs = (RootIn *)malloc(sizeof(RootIn));
+
+  // Set up HPVM DFG inputs in the rootArgs struct.
+  rootArgs->input = input;
+  rootArgs->bytes_input = bytes_image;
+
+  rootArgs->result = result;
+  rootArgs->bytes_result = bytes_image;
+
+  rootArgs->input_scaled = input_scaled;
+  rootArgs->bytes_input_scaled = bytes_fimage;
+
+  rootArgs->result_scaled = result_scaled;
+  rootArgs->bytes_result_scaled = bytes_fimage;
+
+  rootArgs->demosaic_out = demosaic_out;
+  rootArgs->bytes_demosaic_out = bytes_fimage;
+
+  rootArgs->denoise_out = denoise_out;
+  rootArgs->bytes_denoise_out = bytes_fimage;
+
+  rootArgs->transform_out = transform_out;
+  rootArgs->bytes_transform_out = bytes_fimage;
+
+  rootArgs->gamut_out = gamut_out;
+  rootArgs->bytes_gamut_out = bytes_fimage;
+
+  rootArgs->TsTw = TsTw;
+  rootArgs->bytes_TsTw = CHAN_SIZE * CHAN_SIZE * sizeof(float);
+
+  rootArgs->ctrl_pts = ctrl_pts;
+  rootArgs->bytes_ctrl_pts = num_ctrl_pts * CHAN_SIZE * sizeof(float);
+
+  rootArgs->weights = weights;
+  rootArgs->bytes_weights = num_ctrl_pts * CHAN_SIZE * sizeof(float);
+
+  rootArgs->coefs = coefs;
+  rootArgs->bytes_coefs = 4 * CHAN_SIZE * sizeof(float);
+
+  rootArgs->tone_map = tone_map;
+  rootArgs->bytes_tone_map = 256 * CHAN_SIZE * sizeof(float);
+
+  rootArgs->l2_dist = l2_dist;
+  rootArgs->bytes_l2_dist = num_ctrl_pts * sizeof(float);
+
+  rootArgs->row_size = row_size;
+  rootArgs->col_size = col_size;
+
+  // Memory tracking is required for pointer arguments.
+  // Nodes can be scheduled on different targets, and
+  // dataflow edge implementation needs to request data.
+  // The pair (pointer, size) is inserted in memory tracker using this call
+  llvm_visc_track_mem(input, bytes_image);
+  llvm_visc_track_mem(result, bytes_image);
+  llvm_visc_track_mem(input_scaled, bytes_fimage);
+  llvm_visc_track_mem(result_scaled, bytes_fimage);
+  llvm_visc_track_mem(demosaic_out, bytes_fimage);
+  llvm_visc_track_mem(denoise_out, bytes_fimage);
+  llvm_visc_track_mem(transform_out, bytes_fimage);
+  llvm_visc_track_mem(gamut_out, bytes_fimage);
+  llvm_visc_track_mem(TsTw, CHAN_SIZE * CHAN_SIZE * sizeof(float));
+  llvm_visc_track_mem(ctrl_pts, num_ctrl_pts * CHAN_SIZE * sizeof(float));
+  llvm_visc_track_mem(weights, num_ctrl_pts * CHAN_SIZE * sizeof(float));
+  llvm_visc_track_mem(coefs, 4 * CHAN_SIZE * sizeof(float));
+  llvm_visc_track_mem(tone_map, 256 * CHAN_SIZE * sizeof(float));
+  llvm_visc_track_mem(l2_dist, num_ctrl_pts * sizeof(float));
+
+  printf("\n\nLaunching CAVA pipeline!\n");
+
+  void *camPipeDFG = __visc__launch(0, CamPipeRoot, (void *)rootArgs);
+  __visc__wait(camPipeDFG);
+
+  printf("\n\nPipeline execution completed!\n");
+  printf("Pipeline final stage returned %lu; should be %lu\n",
+         rootArgs->ret.bytesRet, bytes_image);
+  printf("\n\nRequesting memory!\n");
+
+  // Request data from graph.
+  llvm_visc_request_mem(result, bytes_image);
+  llvm_visc_request_mem(demosaic_out, bytes_fimage);
+  llvm_visc_request_mem(denoise_out, bytes_fimage);
+  llvm_visc_request_mem(transform_out, bytes_fimage);
+  llvm_visc_request_mem(gamut_out, bytes_fimage);
+  printf("\n\nDone requesting memory!\n");
+
+  uint8_t *gamut_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *demosaic_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *transform_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+  uint8_t *denoise_out_descaled = (uint8_t *)malloc_aligned(bytes_image);
+
+  descale_cpu(demosaic_out, bytes_fimage, demosaic_out_descaled, bytes_image,
+              row_size, col_size);
+  descale_cpu(gamut_out, bytes_fimage, gamut_out_descaled, bytes_image,
+              row_size, col_size);
+  descale_cpu(denoise_out, bytes_fimage, denoise_out_descaled, bytes_image,
+              row_size, col_size);
+  descale_cpu(transform_out, bytes_fimage, transform_out_descaled, bytes_image,
+              row_size, col_size);
+
+  convert_chw_to_hwc(result, row_size, col_size, &image_out);
+  convert_chw_to_hwc(gamut_out_descaled, row_size, col_size, &image_out_gamut);
+  convert_chw_to_hwc(demosaic_out_descaled, row_size, col_size,
+                     &image_out_demosaic);
+  convert_chw_to_hwc(denoise_out_descaled, row_size, col_size,
+                     &image_out_denoise);
+  convert_chw_to_hwc(transform_out_descaled, row_size, col_size,
+                     &image_out_transform);
+
+  // Remove tracked pointers.
+  llvm_visc_untrack_mem(input);
+  llvm_visc_untrack_mem(result);
+  llvm_visc_untrack_mem(input_scaled);
+  llvm_visc_untrack_mem(result_scaled);
+  llvm_visc_untrack_mem(demosaic_out);
+  llvm_visc_untrack_mem(denoise_out);
+  llvm_visc_untrack_mem(transform_out);
+  llvm_visc_untrack_mem(gamut_out);
+
+  llvm_visc_untrack_mem(TsTw);
+  llvm_visc_untrack_mem(ctrl_pts);
+  llvm_visc_untrack_mem(weights);
+  llvm_visc_untrack_mem(coefs);
+  llvm_visc_untrack_mem(tone_map);
+  llvm_visc_untrack_mem(l2_dist);
+
+  // Output the image.
+  // NOTE: We deliberately perform this file I/O outside of the kernel.
   char str[50], base_str[50];
   strcpy(base_str, args.args[OUTPUT_IMAGE_BIN]);
   strcpy(str, base_str);
@@ -887,8 +912,7 @@ int main(int argc, char* argv[]) {
   printf("Writing output image to %s\n", str);
   write_image_to_binary(str, image_out_transform, row_size, col_size);
 
-    __visc__cleanup();
+  __visc__cleanup();
 
-    return 0;
+  return 0;
 }
-
diff --git a/hpvm/test/hpvm-cava/src/pipe_stages.c b/hpvm/test/hpvm-cava/src/pipe_stages.c
index 2ebedec936915b5e7f11881c5001c84b6db26474..253052af872838f6ed363e3497ef64dd288db84e 100644
--- a/hpvm/test/hpvm-cava/src/pipe_stages.c
+++ b/hpvm/test/hpvm-cava/src/pipe_stages.c
@@ -1,44 +1,43 @@
-#include <stdio.h>
-#include <math.h>
 #include "pipe_stages.h"
 #include "cam_pipe_utility.h"
+#include <math.h>
+#include <stdio.h>
 
-//void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) {
-void scale_fxp(uint8_t *input, size_t bytes_input, 
-               float *output, size_t bytes_output,
-               int row_size, int col_size) {
+// void scale_fxp(uint8_t *input, int row_size, int col_size, float *output) {
+void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
+               size_t bytes_output, int row_size, int col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(2, input, output, 1, output);
-  
+
   ARRAY_3D(uint8_t, _input, input, row_size, col_size);
   ARRAY_3D(float, _output, output, row_size, col_size);
-  sl_chan:
+sl_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    sl_row:
+  sl_row:
     for (int row = 0; row < row_size; row++)
-      sl_col:
+    sl_col:
       for (int col = 0; col < col_size; col++)
         _output[chan][row][col] = _input[chan][row][col] * 1.0 / 255;
 
   __visc__return(1, bytes_output);
 }
 
-//void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) {
-void descale_fxp(float *input, size_t bytes_input, 
-                 uint8_t *output, size_t bytes_result,
-                 int row_size, int col_size) {
+// void descale_fxp(float *input, int row_size, int col_size, uint8_t *output) {
+void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
+                 size_t bytes_result, int row_size, int col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(2, input, output, 1, output);
-  
+
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(uint8_t, _output, output, row_size, col_size);
-  dsl_chan:
+dsl_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    dsl_row:
+  dsl_row:
     for (int row = 0; row < row_size; row++)
-      dsl_col:
+    dsl_col:
       for (int col = 0; col < col_size; col++)
-        _output[chan][row][col] = min(max(_input[chan][row][col] * 255, 0), 255);
+        _output[chan][row][col] =
+            min(max(_input[chan][row][col] * 255, 0), 255);
 
   __visc__return(1, bytes_output);
 }
@@ -46,127 +45,125 @@ void descale_fxp(float *input, size_t bytes_input,
 // Demosaicing stage
 // G R
 // B G
-//void demosaic_fxp(float *input, int row_size, int col_size, float *result) {
-void demosaic_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  int row_size, int col_size) {
+// void demosaic_fxp(float *input, int row_size, int col_size, float *result) {
+void demosaic_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, int row_size, int col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(2, input, result, 1, result);
-  
+
   printf("Demosaicing.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
 
-  dm_row:
+dm_row:
   for (int row = 1; row < row_size - 1; row++)
-    dm_col:
+  dm_col:
     for (int col = 1; col < col_size - 1; col++)
-        if (row % 2 == 0 && col % 2 == 0) {
-            // Green pixel
-            // Getting the R values
-            float R1 = _input[0][row][col - 1];
-            float R2 = _input[0][row][col + 1];
-            // Getting the B values
-            float B1 = _input[2][row - 1][col];
-            float B2 = _input[2][row + 1][col];
-            // R
-            _result[0][row][col] = (R1 + R2) / 2;
-            // G
-            _result[1][row][col] = _input[1][row][col] * 2;
-            // B
-            _result[2][row][col] = (B1 + B2) / 2;
-        } else if (row % 2 == 0 && col % 2 == 1) {
-            // Red pixel
-            // Getting the G values
-            float G1 = _input[1][row - 1][col];
-            float G2 = _input[1][row + 1][col];
-            float G3 = _input[1][row][col - 1];
-            float G4 = _input[1][row][col + 1];
-            // Getting the B values
-            float B1 = _input[2][row - 1][col - 1];
-            float B2 = _input[2][row - 1][col + 1];
-            float B3 = _input[2][row + 1][col - 1];
-            float B4 = _input[2][row + 1][col + 1];
-            // R
-            _result[0][row][col] = _input[0][row][col];
-            // G
-            _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
-            // B (center pixel)
-            _result[2][row][col] = (B1 + B2 + B3 + B4) / 4;
-        } else if (row % 2 == 1 && col % 2 == 0) {
-            // Blue pixel
-            // Getting the R values
-            float R1 = _input[0][row - 1][col - 1];
-            float R2 = _input[0][row + 1][col - 1];
-            float R3 = _input[0][row - 1][col + 1];
-            float R4 = _input[0][row + 1][col + 1];
-            // Getting the G values
-            float G1 = _input[1][row - 1][col];
-            float G2 = _input[1][row + 1][col];
-            float G3 = _input[1][row][col - 1];
-            float G4 = _input[1][row][col + 1];
-            // R
-            _result[0][row][col] = (R1 + R2 + R3 + R4) / 4;
-            // G
-            _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
-            // B
-            _result[2][row][col] = _input[2][row][col];
-        } else {
-            // Bottom Green pixel
-            // Getting the R values
-            float R1 = _input[0][row - 1][col];
-            float R2 = _input[0][row + 1][col];
-            // Getting the B values
-            float B1 = _input[2][row][col - 1];
-            float B2 = _input[2][row][col + 1];
-            // R
-            _result[0][row][col] = (R1 + R2) / 2;
-            // G
-            _result[1][row][col] = _input[1][row][col] * 2;
-            // B
-            _result[2][row][col] = (B1 + B2) / 2;
-        }
+      if (row % 2 == 0 && col % 2 == 0) {
+        // Green pixel
+        // Getting the R values
+        float R1 = _input[0][row][col - 1];
+        float R2 = _input[0][row][col + 1];
+        // Getting the B values
+        float B1 = _input[2][row - 1][col];
+        float B2 = _input[2][row + 1][col];
+        // R
+        _result[0][row][col] = (R1 + R2) / 2;
+        // G
+        _result[1][row][col] = _input[1][row][col] * 2;
+        // B
+        _result[2][row][col] = (B1 + B2) / 2;
+      } else if (row % 2 == 0 && col % 2 == 1) {
+        // Red pixel
+        // Getting the G values
+        float G1 = _input[1][row - 1][col];
+        float G2 = _input[1][row + 1][col];
+        float G3 = _input[1][row][col - 1];
+        float G4 = _input[1][row][col + 1];
+        // Getting the B values
+        float B1 = _input[2][row - 1][col - 1];
+        float B2 = _input[2][row - 1][col + 1];
+        float B3 = _input[2][row + 1][col - 1];
+        float B4 = _input[2][row + 1][col + 1];
+        // R
+        _result[0][row][col] = _input[0][row][col];
+        // G
+        _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
+        // B (center pixel)
+        _result[2][row][col] = (B1 + B2 + B3 + B4) / 4;
+      } else if (row % 2 == 1 && col % 2 == 0) {
+        // Blue pixel
+        // Getting the R values
+        float R1 = _input[0][row - 1][col - 1];
+        float R2 = _input[0][row + 1][col - 1];
+        float R3 = _input[0][row - 1][col + 1];
+        float R4 = _input[0][row + 1][col + 1];
+        // Getting the G values
+        float G1 = _input[1][row - 1][col];
+        float G2 = _input[1][row + 1][col];
+        float G3 = _input[1][row][col - 1];
+        float G4 = _input[1][row][col + 1];
+        // R
+        _result[0][row][col] = (R1 + R2 + R3 + R4) / 4;
+        // G
+        _result[1][row][col] = (G1 + G2 + G3 + G4) / 2;
+        // B
+        _result[2][row][col] = _input[2][row][col];
+      } else {
+        // Bottom Green pixel
+        // Getting the R values
+        float R1 = _input[0][row - 1][col];
+        float R2 = _input[0][row + 1][col];
+        // Getting the B values
+        float B1 = _input[2][row][col - 1];
+        float B2 = _input[2][row][col + 1];
+        // R
+        _result[0][row][col] = (R1 + R2) / 2;
+        // G
+        _result[1][row][col] = _input[1][row][col] * 2;
+        // B
+        _result[2][row][col] = (B1 + B2) / 2;
+      }
 
   __visc__return(1, bytes_result);
 }
 
 static void sort(float arr[], int n) {
-    int i, j;
-    dn_sort_i:
-    for (i = 0; i < n - 1; i++)
-        dn_sort_j:
-        for (j = 0; j < n - i - 1; j++)
-            if (arr[j] > arr[j + 1]) {
-                float temp = arr[j];
-                arr[j] = arr[j + 1];
-                arr[j + 1] = temp;
-            }
+  int i, j;
+dn_sort_i:
+  for (i = 0; i < n - 1; i++)
+  dn_sort_j:
+    for (j = 0; j < n - i - 1; j++)
+      if (arr[j] > arr[j + 1]) {
+        float temp = arr[j];
+        arr[j] = arr[j + 1];
+        arr[j + 1] = temp;
+      }
 }
 
 // Simple denoise
-//void denoise_fxp(float *input, int row_size, int col_size, float *result) {
-void denoise_fxp(float *input, size_t bytes_input, 
-                 float *result, size_t bytes_result,
-                 int row_size, int col_size) {
+// void denoise_fxp(float *input, int row_size, int col_size, float *result) {
+void denoise_fxp(float *input, size_t bytes_input, float *result,
+                 size_t bytes_result, int row_size, int col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(2, input, result, 1, result);
-  
+
   printf("Denoising.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
 
-  dn_chan:
+dn_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    dn_row:
+  dn_row:
     for (int row = 0; row < row_size; row++)
-      dn_col:
+    dn_col:
       for (int col = 0; col < col_size; col++)
         if (row >= 1 && row < row_size - 1 && col >= 1 && col < col_size - 1) {
           float filter[9];
-          dn_slide_row:
-          for (int i = row-1; i < row+2; i++)
-            dn_slide_col:
-            for (int j = col-1; j < col+2; j++) {
+        dn_slide_row:
+          for (int i = row - 1; i < row + 2; i++)
+          dn_slide_col:
+            for (int j = col - 1; j < col + 2; j++) {
               int index = (i - row + 1) * 3 + j - col + 1;
               filter[index] = _input[chan][i][j];
             }
@@ -179,25 +176,24 @@ void denoise_fxp(float *input, size_t bytes_input,
 }
 
 // Color map and white balance transform
-//void transform_fxp(float *input, int row_size, int col_size, float *result,
+// void transform_fxp(float *input, int row_size, int col_size, float *result,
 //                   float *TsTw_tran) {
-void transform_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *TsTw_tran, size_t bytes_TsTw,
+void transform_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
                    int row_size, int col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(3, input, result, TsTw_tran, 1, result);
-  
+
   printf("Color mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
   ARRAY_2D(float, _TsTw_tran, TsTw_tran, 3);
 
-  tr_chan:
+tr_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    tr_row:
+  tr_row:
     for (int row = 0; row < row_size; row++)
-      tr_col:
+    tr_col:
       for (int col = 0; col < col_size; col++)
         _result[chan][row][col] =
             max(_input[0][row][col] * _TsTw_tran[0][chan] +
@@ -210,18 +206,18 @@ void transform_fxp(float *input, size_t bytes_input,
 //
 // Weighted radial basis function for gamut mapping
 //
-//void gamut_map_fxp(float *input, int row_size, int col_size, float *result,
-//                   float *ctrl_pts, float *weights, float *coefs, float *l2_dist) {
-void gamut_map_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *ctrl_pts, size_t bytes_ctrl_pts,
-                   float *weights, size_t bytes_weights,
-                   float *coefs, size_t bytes_coefs,
-                   float *l2_dist, size_t bytes_l2_dist,
+// void gamut_map_fxp(float *input, int row_size, int col_size, float *result,
+//                   float *ctrl_pts, float *weights, float *coefs, float
+//                   *l2_dist) {
+void gamut_map_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights, float *coefs,
+                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
                    int row_size, int col_size) {
   __visc__hint(DEVICE);
-  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1, result);
-  
+  __visc__attributes(6, input, result, ctrl_pts, weights, coefs, l2_dist, 1,
+                     result);
+
   printf("Gamut mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
@@ -229,26 +225,25 @@ void gamut_map_fxp(float *input, size_t bytes_input,
   ARRAY_2D(float, _weights, weights, 3);
   ARRAY_2D(float, _coefs, coefs, 3);
 
-  // First, get the L2 norm from every pixel to the control points,
-  // Then, sum it and weight it. Finally, add the bias.
-  gm_rbf_row:
+// First, get the L2 norm from every pixel to the control points,
+// Then, sum it and weight it. Finally, add the bias.
+gm_rbf_row:
   for (int row = 0; row < row_size; row++)
-    gm_rbf_col:
+  gm_rbf_col:
     for (int col = 0; col < col_size; col++) {
-      gm_rbf_cp0:
+    gm_rbf_cp0:
       for (int cp = 0; cp < num_ctrl_pts; cp++) {
-        l2_dist[cp] =
-            sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) *
-                     (_input[0][row][col] - _ctrl_pts[cp][0]) +
-                 (_input[1][row][col] - _ctrl_pts[cp][1]) *
-                     (_input[1][row][col] - _ctrl_pts[cp][1]) +
-                 (_input[2][row][col] - _ctrl_pts[cp][2]) *
-                     (_input[2][row][col] - _ctrl_pts[cp][2]));
+        l2_dist[cp] = sqrt((_input[0][row][col] - _ctrl_pts[cp][0]) *
+                               (_input[0][row][col] - _ctrl_pts[cp][0]) +
+                           (_input[1][row][col] - _ctrl_pts[cp][1]) *
+                               (_input[1][row][col] - _ctrl_pts[cp][1]) +
+                           (_input[2][row][col] - _ctrl_pts[cp][2]) *
+                               (_input[2][row][col] - _ctrl_pts[cp][2]));
       }
-      gm_rbf_chan:
+    gm_rbf_chan:
       for (int chan = 0; chan < CHAN_SIZE; chan++) {
         float chan_val = 0.0;
-        gm_rbf_cp1:
+      gm_rbf_cp1:
         for (int cp = 0; cp < num_ctrl_pts; cp++) {
           chan_val += l2_dist[cp] * _weights[cp][chan];
         }
@@ -263,25 +258,24 @@ void gamut_map_fxp(float *input, size_t bytes_input,
 }
 
 // Tone mapping
-//void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map,
+// void tone_map_fxp(float *input, int row_size, int col_size, float *tone_map,
 //                  float *result) {
-void tone_map_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  float *tone_map, size_t bytes_tone_map,
+void tone_map_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
                   int row_size, int col_size) {
   __visc__hint(DEVICE);
   __visc__attributes(3, input, result, tone_map, 1, result);
-  
+
   printf("Tone mapping.\n");
   ARRAY_3D(float, _input, input, row_size, col_size);
   ARRAY_3D(float, _result, result, row_size, col_size);
   ARRAY_2D(float, _tone_map, tone_map, 3);
 
-  tm_chan:
+tm_chan:
   for (int chan = 0; chan < CHAN_SIZE; chan++)
-    tm_row:
+  tm_row:
     for (int row = 0; row < row_size; row++)
-      tm_col:
+    tm_col:
       for (int col = 0; col < col_size; col++) {
         uint8_t x = _input[chan][row][col] * 255;
         _result[chan][row][col] = _tone_map[x][chan];
diff --git a/hpvm/test/hpvm-cava/src/pipe_stages.h b/hpvm/test/hpvm-cava/src/pipe_stages.h
index 8d98cb65cc8af7353cc1faf08988f3b1a6758046..f960822a03326638189c8d294938452ba2670b41 100644
--- a/hpvm/test/hpvm-cava/src/pipe_stages.h
+++ b/hpvm/test/hpvm-cava/src/pipe_stages.h
@@ -7,54 +7,52 @@
 
 #define ISP 0x4
 
-#define max(a,b) \
-  ({ __typeof__ (a) _a = (a); \
-      __typeof__ (b) _b = (b); \
-    _a > _b ? _a : _b; })
-
-#define min(a,b) \
-  ({ __typeof__ (a) _a = (a); \
-      __typeof__ (b) _b = (b); \
-    _a < _b ? _a : _b; })
-
-#define abs(a) \
-  ({ __typeof__ (a) _a = (a); \
-    _a < 0 ? -_a : _a; })
+#define max(a, b)                                                              \
+  ({                                                                           \
+    __typeof__(a) _a = (a);                                                    \
+    __typeof__(b) _b = (b);                                                    \
+    _a > _b ? _a : _b;                                                         \
+  })
+
+#define min(a, b)                                                              \
+  ({                                                                           \
+    __typeof__(a) _a = (a);                                                    \
+    __typeof__(b) _b = (b);                                                    \
+    _a < _b ? _a : _b;                                                         \
+  })
+
+#define abs(a)                                                                 \
+  ({                                                                           \
+    __typeof__(a) _a = (a);                                                    \
+    _a < 0 ? -_a : _a;                                                         \
+  })
 
 extern int num_ctrl_pts;
 
-void scale_fxp(uint8_t *input, size_t bytes_input, 
-               float *output, size_t bytes_output,
-               size_t row_size, size_t col_size);
+void scale_fxp(uint8_t *input, size_t bytes_input, float *output,
+               size_t bytes_output, size_t row_size, size_t col_size);
 
-void descale_fxp(float *input, size_t bytes_input, 
-                 uint8_t *output, size_t bytes_result,
-                 size_t row_size, size_t col_size);
+void descale_fxp(float *input, size_t bytes_input, uint8_t *output,
+                 size_t bytes_result, size_t row_size, size_t col_size);
 
-void demosaic_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  size_t row_size, size_t col_size);
+void demosaic_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, size_t row_size, size_t col_size);
 
-void denoise_fxp(float *input, size_t bytes_input, 
-                 float *result, size_t bytes_result,
-                 size_t row_size, size_t col_size);
+void denoise_fxp(float *input, size_t bytes_input, float *result,
+                 size_t bytes_result, size_t row_size, size_t col_size);
 
-void transform_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *TsTw_tran, size_t bytes_TsTw,
+void transform_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *TsTw_tran, size_t bytes_TsTw,
                    size_t row_size, size_t col_size);
 
-void gamut_map_fxp(float *input, size_t bytes_input, 
-                   float *result, size_t bytes_result,
-                   float *ctrl_pts, size_t bytes_ctrl_pts,
-                   float *weights, size_t bytes_weights,
-                   float *coefs, size_t bytes_coefs,
-                   float *l2_dist, size_t bytes_l2_dist,
+void gamut_map_fxp(float *input, size_t bytes_input, float *result,
+                   size_t bytes_result, float *ctrl_pts, size_t bytes_ctrl_pts,
+                   float *weights, size_t bytes_weights, float *coefs,
+                   size_t bytes_coefs, float *l2_dist, size_t bytes_l2_dist,
                    size_t row_size, size_t col_size);
 
-void tone_map_fxp(float *input, size_t bytes_input, 
-                  float *result, size_t bytes_result,
-                  float *tone_map, size_t bytes_tone_map,
+void tone_map_fxp(float *input, size_t bytes_input, float *result,
+                  size_t bytes_result, float *tone_map, size_t bytes_tone_map,
                   size_t row_size, size_t col_size);
 
 void tone_map_approx_fxp(float *input, size_t row_size, size_t col_size,
diff --git a/hpvm/test/hpvm-cava/src/utility.c b/hpvm/test/hpvm-cava/src/utility.c
index c1eaee3333c2afffdcae827f956efa4e25705352..86bd018183403f637ca8fb7cfb634a09c3ceace8 100644
--- a/hpvm/test/hpvm-cava/src/utility.c
+++ b/hpvm/test/hpvm-cava/src/utility.c
@@ -1,7 +1,7 @@
-#include <stdlib.h>
-#include <assert.h>
-#include "defs.h"
 #include "utility.h"
+#include "defs.h"
+#include <assert.h>
+#include <stdlib.h>
 
 void *malloc_aligned(size_t size) {
   void *ptr = NULL;
diff --git a/hpvm/test/hpvm-cava/src/visc.h b/hpvm/test/hpvm-cava/src/visc.h
index 3a05f49e299a0a63a2251db65762561c25ed3981..917aec5a3773657e63655191b7897b9035b6d378 100644
--- a/hpvm/test/hpvm-cava/src/visc.h
+++ b/hpvm/test/hpvm-cava/src/visc.h
@@ -15,62 +15,62 @@
 #ifdef __cplusplus
 extern "C" {
 void __visc__hint(visc::Target);
-//void __visc__wait(void*);
+// void __visc__wait(void*);
 #else
 void __visc__hint(enum Target);
-//void __visc__wait(unsigned);
+// void __visc__wait(unsigned);
 #endif
 
 #ifdef __cplusplus
-//void* __visc__node(...);
-//void* __visc__createNode(...);
-//void* __visc__createNode1D(...);
-//void* __visc__createNode2D(...);
-//void* __visc__createNode3D(...);
-//void __visc__return(...);
+// void* __visc__node(...);
+// void* __visc__createNode(...);
+// void* __visc__createNode1D(...);
+// void* __visc__createNode2D(...);
+// void* __visc__createNode3D(...);
+// void __visc__return(...);
 #endif
 
-void* __visc__createNodeND(unsigned,...);
+void *__visc__createNodeND(unsigned, ...);
 void __visc__return(unsigned, ...);
 
 void __visc__attributes(unsigned, ...);
 void __visc__init();
 void __visc__cleanup();
 
-void __visc__bindIn(void*, unsigned, unsigned, unsigned);
-void __visc__bindOut(void*, unsigned, unsigned, unsigned);
-void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned);
-void __visc__push(void*, void*);
-void* __visc__pop(void*);
-void* __visc__launch(unsigned, ...);
-void __visc__wait(void*);
+void __visc__bindIn(void *, unsigned, unsigned, unsigned);
+void __visc__bindOut(void *, unsigned, unsigned, unsigned);
+void *__visc__edge(void *, void *, unsigned, unsigned, unsigned, unsigned);
+void __visc__push(void *, void *);
+void *__visc__pop(void *);
+void *__visc__launch(unsigned, ...);
+void __visc__wait(void *);
 
-void* __visc__getNode();
-void* __visc__getParentNode(void*);
+void *__visc__getNode();
+void *__visc__getParentNode(void *);
 void __visc__barrier();
-void* __visc__malloc(long);
-long __visc__getNodeInstanceID_x(void*);
-long __visc__getNodeInstanceID_y(void*);
-long __visc__getNodeInstanceID_z(void*);
-long __visc__getNumNodeInstances_x(void*);
-long __visc__getNumNodeInstances_y(void*);
-long __visc__getNumNodeInstances_z(void*);
+void *__visc__malloc(long);
+long __visc__getNodeInstanceID_x(void *);
+long __visc__getNodeInstanceID_y(void *);
+long __visc__getNodeInstanceID_z(void *);
+long __visc__getNumNodeInstances_x(void *);
+long __visc__getNumNodeInstances_y(void *);
+long __visc__getNumNodeInstances_z(void *);
 
 // Atomic
 // signed int
-int __visc__atomic_cmpxchg(int*, int, int);
-int __visc__atomic_add(int*, int);
-int __visc__atomic_sub(int*, int);
-int __visc__atomic_xchg(int*, int);
-int __visc__atomic_inc(int*);
-int __visc__atomic_dec(int*);
-int __visc__atomic_min(int*, int);
-int __visc__atomic_max(int*, int);
-int __visc__atomic_umax(int*, int);
-int __visc__atomic_umin(int*, int);
-int __visc__atomic_and(int*, int);
-int __visc__atomic_or(int*, int);
-int __visc__atomic_xor(int*, int);
+int __visc__atomic_cmpxchg(int *, int, int);
+int __visc__atomic_add(int *, int);
+int __visc__atomic_sub(int *, int);
+int __visc__atomic_xchg(int *, int);
+int __visc__atomic_inc(int *);
+int __visc__atomic_dec(int *);
+int __visc__atomic_min(int *, int);
+int __visc__atomic_max(int *, int);
+int __visc__atomic_umax(int *, int);
+int __visc__atomic_umin(int *, int);
+int __visc__atomic_and(int *, int);
+int __visc__atomic_or(int *, int);
+int __visc__atomic_xor(int *, int);
 
 // Special Func
 float __visc__floor(float);
@@ -79,18 +79,17 @@ float __visc__sqrt(float);
 float __visc__sin(float);
 float __visc__cos(float);
 // unsigned int
-//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned);
-//unsigned __visc__atomic_add(unsigned*, unsigned);
-//unsigned __visc__atomic_sub(unsigned*, unsigned);
-//unsigned __visc__atomic_xchg(unsigned*, unsigned);
-//unsigned __visc__atomic_inc(unsigned*);
-//unsigned __visc__atomic_dec(unsigned*);
-//unsigned __visc__atomic_min(unsigned*, unsigned);
-//unsigned __visc__atomic_max(unsigned*, unsigned);
-//unsigned __visc__atomic_and(unsigned*, unsigned);
-//unsigned __visc__atomic_or(unsigned*, unsigned);
-//unsigned __visc__atomic_xor(unsigned*, unsigned);
-
+// unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned);
+// unsigned __visc__atomic_add(unsigned*, unsigned);
+// unsigned __visc__atomic_sub(unsigned*, unsigned);
+// unsigned __visc__atomic_xchg(unsigned*, unsigned);
+// unsigned __visc__atomic_inc(unsigned*);
+// unsigned __visc__atomic_dec(unsigned*);
+// unsigned __visc__atomic_min(unsigned*, unsigned);
+// unsigned __visc__atomic_max(unsigned*, unsigned);
+// unsigned __visc__atomic_and(unsigned*, unsigned);
+// unsigned __visc__atomic_or(unsigned*, unsigned);
+// unsigned __visc__atomic_xor(unsigned*, unsigned);
 
 #include <unistd.h>
 
@@ -99,12 +98,10 @@ long get_group_id(int);
 long get_local_id(int);
 long get_local_size(int);
 
-
-void llvm_visc_track_mem(void*, size_t);
-void llvm_visc_untrack_mem(void*);
-void llvm_visc_request_mem(void*, size_t);
+void llvm_visc_track_mem(void *, size_t);
+void llvm_visc_untrack_mem(void *);
+void llvm_visc_request_mem(void *, size_t);
 
 #ifdef __cplusplus
 }
 #endif
-
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/base/main.cc b/hpvm/test/parboil/benchmarks/bfs/src/base/main.cc
index 24aa24bf8b645ec0669662ec16c16b2b09d7936c..ba55abc2697a854a0eccb269ffd8301a79343b3b 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/base/main.cc
+++ b/hpvm/test/parboil/benchmarks/bfs/src/base/main.cc
@@ -9,204 +9,197 @@
   Implementing Breadth first search on CUDA using algorithm given in DAC'10
   paper "An Effective GPU Implementation of Breadth-First Search"
 
-  Copyright (c) 2010 University of Illinois at Urbana-Champaign. 
+  Copyright (c) 2010 University of Illinois at Urbana-Champaign.
   All rights reserved.
 
-  Permission to use, copy, modify and distribute this software and its documentation for 
-  educational purpose is hereby granted without fee, provided that the above copyright 
-  notice and this permission notice appear in all copies of this software and that you do 
-  not sell the software.
+  Permission to use, copy, modify and distribute this software and its
+  documentation for educational purpose is hereby granted without fee, provided
+  that the above copyright notice and this permission notice appear in all
+  copies of this software and that you do not sell the software.
 
-  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR 
-  OTHERWISE.
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS,
+  IMPLIED OR OTHERWISE.
 
   Author: Lijiuan Luo (lluo3@uiuc.edu)
 */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <parboil.h>
 #include <deque>
 #include <iostream>
+#include <math.h>
+#include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #define MAX_THREADS_PER_BLOCK 512
-#define NUM_SM 30//the number of Streaming Multiprocessors; may change in the future archs 
-#define NUM_SP 16//8//the number of Streaming processors within each SM; may change in the future 
-	//architectures
-#define EXP 4//3// EXP = log(NUM_SP), assuming NUM_SP is still power of 2 in the future architecture
-	//using EXP and shifting can speed up division operation 
-#define MOD_OP 8//7 // This variable is also related with NUM_SP; may change in the future architecture;
-	//using MOD_OP and "bitwise and" can speed up mod operation
-#define INF 2147483647//2^31-1
-
-#define UP_LIMIT 16677216//2^24
+#define NUM_SM                                                                 \
+  30 // the number of Streaming Multiprocessors; may change in the future archs
+#define NUM_SP                                                                 \
+  16 // 8//the number of Streaming processors within each SM; may change in the
+     // future
+// architectures
+#define EXP                                                                    \
+  4 // 3// EXP = log(NUM_SP), assuming NUM_SP is still power of 2 in the future
+    // architecture
+// using EXP and shifting can speed up division operation
+#define MOD_OP                                                                 \
+  8 // 7 // This variable is also related with NUM_SP; may change in the future
+    // architecture;
+// using MOD_OP and "bitwise and" can speed up mod operation
+#define INF 2147483647 // 2^31-1
+
+#define UP_LIMIT 16677216 // close to, but not exactly, 2^24 (= 16777216)
 #define WHITE 16677217
 #define GRAY 16677218
 #define GRAY0 16677219
 #define GRAY1 16677220
 #define BLACK 16677221
-int no_of_nodes; //the number of nodes in the graph
-int edge_list_size;//the number of edges in the graph
+int no_of_nodes;    // the number of nodes in the graph
+int edge_list_size; // the number of edges in the graph
 FILE *fp;
 
-//typedef int2 Node;
-//typedef int2 Edge;
+// typedef int2 Node;
+// typedef int2 Edge;
 
-struct Node{
-    int x;
-    int y;
+struct Node {
+  int x;
+  int y;
 };
 
-struct Edge{
-    int x;
-    int y;
+struct Edge {
+  int x;
+  int y;
 };
-//Somehow "cudaMemset" does not work. So I use cudaMemcpy of constant variables for initialization
+// Somehow "cudaMemset" does not work. So I use cudaMemcpy of constant variables
+// for initialization
 const int h_top = 1;
 const int zero = 0;
 
-void runCPU(int argc, char** argv);
-void runGPU(int argc, char** argv);
+void runCPU(int argc, char **argv);
+void runGPU(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////
-//the cpu version of bfs for speed comparison
-//the text book version ("Introduction to Algorithms")
+// CPU reference version of BFS, used for speed comparison.
+// Follows the textbook algorithm ("Introduction to Algorithms").
 ////////////////////////////////////////////////////////////////////
-void  BFS_CPU( Node * h_graph_nodes,Edge * h_graph_edges,
-	int * color, int * h_cost, int source){
-	std::deque<int> wavefront;	
-	wavefront.push_back(source);
-	color[source] = GRAY;
-	int index;
-	while(!wavefront.empty()){
-		index = wavefront.front();
-		wavefront.pop_front();
-		for(int i=h_graph_nodes[index].x; 
-			i<(h_graph_nodes[index].y + 
-				h_graph_nodes[index].x); i++)
-			{
-			int id = h_graph_edges[i].x;
-			if(color[id] == WHITE){
-				h_cost[id]=h_cost[index]+1;
-				wavefront.push_back(id);
-				color[id] = GRAY;
-			}
-		}
-		color[index] = BLACK;
-		
-		
-	}
-	
+void BFS_CPU(Node *h_graph_nodes, Edge *h_graph_edges, int *color, int *h_cost,
+             int source) {
+  std::deque<int> wavefront;
+  wavefront.push_back(source);
+  color[source] = GRAY;
+  int index;
+  while (!wavefront.empty()) {
+    index = wavefront.front();
+    wavefront.pop_front();
+    for (int i = h_graph_nodes[index].x;
+         i < (h_graph_nodes[index].y + h_graph_nodes[index].x); i++) {
+      int id = h_graph_edges[i].x;
+      if (color[id] == WHITE) {
+        h_cost[id] = h_cost[index] + 1;
+        wavefront.push_back(id);
+        color[id] = GRAY;
+      }
+    }
+    color[index] = BLACK;
+  }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Main Program
 ////////////////////////////////////////////////////////////////////////////////
-int main( int argc, char** argv) 
-{
-	no_of_nodes=0;
-	edge_list_size=0;
-	runCPU(argc,argv);
-//	if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
-//		cutilDeviceInit(argc, argv);
-//	else
-		//cudaSetDevice( cutGetMaxGflopsDeviceId() );
-//		cudaSetDevice( 1);
-
-
-	//CUT_EXIT(argc, argv);
+int main(int argc, char **argv) {
+  no_of_nodes = 0;
+  edge_list_size = 0;
+  runCPU(argc, argv);
+  //	if( cutCheckCmdLineFlag(argc, (const char**)argv, "device") )
+  //		cutilDeviceInit(argc, argv);
+  //	else
+  // cudaSetDevice( cutGetMaxGflopsDeviceId() );
+  //		cudaSetDevice( 1);
+
+  // CUT_EXIT(argc, argv);
 }
 ///////////////////////////////
-//FUNCTION: only run CPU version 
+// FUNCTION: only run CPU version
 ////////////////////////////////////////////
-void runCPU( int argc, char** argv) 
-{
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    pb_InitializeTimerSet(&timers);
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-    {
-        fprintf(stderr, "Expecting one input filename\n");
-        exit(-1);
-    }
-
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	//printf("Reading File\n");
-	//Read in Graph from a file
-	fp = fopen(params->inpFiles[0],"r");
-	if(!fp)
-	{
-		printf("Error Reading graph file\n");
-		return;
-	}
-
-	int source;
-
-	fscanf(fp,"%d",&no_of_nodes);
-	// allocate host memory
-	Node* h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
-	int *color = (int*) malloc(sizeof(int)*no_of_nodes);
-	int start, edgeno;   
-	// initalize the memory
-	for( unsigned int i = 0; i < no_of_nodes; i++) 
-	{
-		fscanf(fp,"%d %d",&start,&edgeno);
-		h_graph_nodes[i].x = start;
-		h_graph_nodes[i].y = edgeno;
-		color[i]=WHITE;
-	}
-	//read the source node from the file
-	fscanf(fp,"%d",&source);
-	fscanf(fp,"%d",&edge_list_size);
-	int id,cost;
-	Edge* h_graph_edges = (Edge*) malloc(sizeof(Edge)*edge_list_size);
-	for(int i=0; i < edge_list_size ; i++)
-	{
-		fscanf(fp,"%d",&id);
-		fscanf(fp,"%d",&cost);
-		h_graph_edges[i].x = id;
-		h_graph_edges[i].y = cost;
-	}
-	if(fp)
-		fclose(fp);    
-
-	//printf("Read File\n");
-
-	// allocate mem for the result on host side
-	int* h_cost = (int*) malloc( sizeof(int)*no_of_nodes);
-	for(int i = 0; i < no_of_nodes; i++){
-		h_cost[i] = INF;
-	}
-	h_cost[source] = 0;
-	//printf("start cpu version\n");
-	unsigned int cpu_timer = 0;
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	BFS_CPU( h_graph_nodes, h_graph_edges, color, h_cost,  source 
-		 );
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    if(params->outFile!=NULL)
-    {
-        //printf("Result stored in %s\n", params->outFile);
-        FILE *fp = fopen(params->outFile,"w");
-        fprintf(fp,"%d\n", no_of_nodes);
-    	for(int i=0;i<no_of_nodes;i++)
-    		fprintf(fp,"%d %d\n",i,h_cost[i]);
-    	fclose(fp);
-    }
-
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	// cleanup memory
-	free( h_graph_nodes);
-	free( h_graph_edges);
-	free( color);
-	free( h_cost);
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
-    pb_FreeParameters(params);
+void runCPU(int argc, char **argv) {
+  // Load graph from the input file, run BFS_CPU, optionally write the costs.
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
+
+  pb_InitializeTimerSet(&timers);
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) {
+    fprintf(stderr, "Expecting one input filename\n");
+    exit(-1);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // printf("Reading File\n");
+  // Open the graph file for reading; the handle is stored in the global fp.
+  fp = fopen(params->inpFiles[0], "r");
+  if (!fp) {
+    printf("Error Reading graph file\n");
+    return;
+  }
+  // NOTE(review): the early return above skips pb_FreeParameters(params).
+  int source;
+  // NOTE(review): fscanf return values are never checked in this function.
+  fscanf(fp, "%d", &no_of_nodes);
+  // Allocate host arrays: one Node record and one color entry per graph node.
+  Node *h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
+  int *color = (int *)malloc(sizeof(int) * no_of_nodes);
+  int start, edgeno;
+  // Initialize: per node, read (start, edgeno) into x/y and mark it WHITE.
+  for (unsigned int i = 0; i < no_of_nodes; i++) {
+    fscanf(fp, "%d %d", &start, &edgeno);
+    h_graph_nodes[i].x = start;
+    h_graph_nodes[i].y = edgeno;
+    color[i] = WHITE;
+  }
+  // Read the source node, then the number of edge records that follow.
+  fscanf(fp, "%d", &source);
+  fscanf(fp, "%d", &edge_list_size);
+  int id, cost;
+  Edge *h_graph_edges = (Edge *)malloc(sizeof(Edge) * edge_list_size);
+  for (int i = 0; i < edge_list_size; i++) {
+    fscanf(fp, "%d", &id);
+    fscanf(fp, "%d", &cost);
+    h_graph_edges[i].x = id;
+    h_graph_edges[i].y = cost;
+  }
+  if (fp)
+    fclose(fp);
+
+  // printf("Read File\n");
+
+  // Allocate the result array: every cost is INF except the source, which is 0.
+  int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
+  for (int i = 0; i < no_of_nodes; i++) {
+    h_cost[i] = INF;
+  }
+  h_cost[source] = 0;
+  // printf("start cpu version\n");
+  unsigned int cpu_timer = 0;
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  BFS_CPU(h_graph_nodes, h_graph_edges, color, h_cost, source);
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (params->outFile != NULL) {
+    // printf("Result stored in %s\n", params->outFile);
+    FILE *fp = fopen(params->outFile, "w");
+    fprintf(fp, "%d\n", no_of_nodes);
+    for (int i = 0; i < no_of_nodes; i++)
+      fprintf(fp, "%d %d\n", i, h_cost[i]);
+    fclose(fp);
+  }
+  // NOTE(review): the fopen() of params->outFile above is not NULL-checked.
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Release the host buffers, then print the timers and free the parameters.
+  free(h_graph_nodes);
+  free(h_graph_edges);
+  free(color);
+  free(h_cost);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(params);
+}
 ///////////////////////////////
-//FUNCTION:only run GPU version 
+// FUNCTION:only run GPU version
 ////////////////////////////////////////////
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/cuda/config.h b/hpvm/test/parboil/benchmarks/bfs/src/cuda/config.h
index 18039547e2244b33f30f02ea4df1edc445debcf8..e5e2420355ef67bcd9f638c5d4e48ee47e657942 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/cuda/config.h
+++ b/hpvm/test/parboil/benchmarks/bfs/src/cuda/config.h
@@ -1,12 +1,19 @@
 #define MAX_THREADS_PER_BLOCK 512
-#define NUM_SM 14 //the number of Streaming Multiprocessors; 15 for Fermi architecture 30 for G280 at the moment of this document
-#define NUM_BIN 8 //the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU
-#define EXP 3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future architecture
-	//using EXP and shifting can speed up division operation 
-#define MOD_OP 7 // This variable is also related with NUM_BIN; may change in the future architecture;
-	//using MOD_OP and "bitwise and" can speed up mod operation
-#define INF 2147483647//2^31-1
-#define UP_LIMIT 16677216//2^24
+#define NUM_SM                                                                 \
+  14 // the number of Streaming Multiprocessors; 15 for Fermi architecture 30
+     // for G280 at the moment of this document
+#define NUM_BIN                                                                \
+  8 // the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU
+#define EXP                                                                    \
+  3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future
+    // architecture
+    // using EXP and shifting can speed up division operation
+#define MOD_OP                                                                 \
+  7 // This variable is also related with NUM_BIN; may change in the future
+    // architecture;
+    // using MOD_OP and "bitwise and" can speed up mod operation
+#define INF 2147483647    // 2^31-1
+#define UP_LIMIT 16677216 // close to, but not exactly, 2^24 (= 16777216)
 #define WHITE 16677217
 #define GRAY 16677218
 #define GRAY0 16677219
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/cuda_base/config.h b/hpvm/test/parboil/benchmarks/bfs/src/cuda_base/config.h
index 18039547e2244b33f30f02ea4df1edc445debcf8..e5e2420355ef67bcd9f638c5d4e48ee47e657942 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/cuda_base/config.h
+++ b/hpvm/test/parboil/benchmarks/bfs/src/cuda_base/config.h
@@ -1,12 +1,19 @@
 #define MAX_THREADS_PER_BLOCK 512
-#define NUM_SM 14 //the number of Streaming Multiprocessors; 15 for Fermi architecture 30 for G280 at the moment of this document
-#define NUM_BIN 8 //the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU
-#define EXP 3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future architecture
-	//using EXP and shifting can speed up division operation 
-#define MOD_OP 7 // This variable is also related with NUM_BIN; may change in the future architecture;
-	//using MOD_OP and "bitwise and" can speed up mod operation
-#define INF 2147483647//2^31-1
-#define UP_LIMIT 16677216//2^24
+#define NUM_SM                                                                 \
+  14 // the number of Streaming Multiprocessors; 15 for Fermi architecture 30
+     // for G280 at the moment of this document
+#define NUM_BIN                                                                \
+  8 // the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU
+#define EXP                                                                    \
+  3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future
+    // architecture
+    // using EXP and shifting can speed up division operation
+#define MOD_OP                                                                 \
+  7 // This variable is also related with NUM_BIN; may change in the future
+    // architecture;
+    // using MOD_OP and "bitwise and" can speed up mod operation
+#define INF 2147483647    // 2^31-1
+#define UP_LIMIT 16677216 // close to, but not exactly, 2^24 (= 16777216)
 #define WHITE 16677217
 #define GRAY 16677218
 #define GRAY0 16677219
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/omp_base/main.cc b/hpvm/test/parboil/benchmarks/bfs/src/omp_base/main.cc
index d5d91ea4ccef7f03b788d41b06f5d7f12a57f4ac..01664c78345db542b37530b77eed54eb3c1fd1cd 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/omp_base/main.cc
+++ b/hpvm/test/parboil/benchmarks/bfs/src/omp_base/main.cc
@@ -9,200 +9,195 @@
   Implementing Breadth first search on CUDA using algorithm given in DAC'10
   paper "An Effective GPU Implementation of Breadth-First Search"
 
-  Copyright (c) 2010 University of Illinois at Urbana-Champaign. 
+  Copyright (c) 2010 University of Illinois at Urbana-Champaign.
   All rights reserved.
 
-  Permission to use, copy, modify and distribute this software and its documentation for 
-  educational purpose is hereby granted without fee, provided that the above copyright 
-  notice and this permission notice appear in all copies of this software and that you do 
-  not sell the software.
+  Permission to use, copy, modify and distribute this software and its
+  documentation for educational purpose is hereby granted without fee, provided
+  that the above copyright notice and this permission notice appear in all
+  copies of this software and that you do not sell the software.
 
-  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR 
-  OTHERWISE.
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS,
+  IMPLIED OR OTHERWISE.
 
   Author: Lijiuan Luo (lluo3@uiuc.edu)
 */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <parboil.h>
 #include <deque>
 #include <iostream>
+#include <math.h>
+#include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #define MAX_THREADS_PER_BLOCK 512
-#define NUM_SM 30//the number of Streaming Multiprocessors; may change in the future archs 
-#define NUM_SP 16//8//the number of Streaming processors within each SM; may change in the future 
-	//architectures
-#define EXP 4//3// EXP = log(NUM_SP), assuming NUM_SP is still power of 2 in the future architecture
-	//using EXP and shifting can speed up division operation 
-#define MOD_OP 8//7 // This variable is also related with NUM_SP; may change in the future architecture;
-	//using MOD_OP and "bitwise and" can speed up mod operation
-#define INF 2147483647//2^31-1
-
-#define UP_LIMIT 16677216//2^24
+#define NUM_SM                                                                 \
+  30 // the number of Streaming Multiprocessors; may change in the future archs
+#define NUM_SP                                                                 \
+  16 // 8//the number of Streaming processors within each SM; may change in the
+     // future
+// architectures
+#define EXP                                                                    \
+  4 // 3// EXP = log(NUM_SP), assuming NUM_SP is still power of 2 in the future
+    // architecture
+// using EXP and shifting can speed up division operation
+#define MOD_OP                                                                 \
+  8 // 7 // This variable is also related with NUM_SP; may change in the future
+    // architecture;
+// using MOD_OP and "bitwise and" can speed up mod operation
+#define INF 2147483647 // 2^31-1
+
+#define UP_LIMIT 16677216 // ~2^24 (exact 2^24 is 16777216)
 #define WHITE 16677217
 #define GRAY 16677218
 #define GRAY0 16677219
 #define GRAY1 16677220
 #define BLACK 16677221
-int no_of_nodes; //the number of nodes in the graph
-int edge_list_size;//the number of edges in the graph
+int no_of_nodes;    // the number of nodes in the graph
+int edge_list_size; // the number of edges in the graph
 FILE *fp;
 
-//typedef int2 Node;
-//typedef int2 Edge;
+// typedef int2 Node;
+// typedef int2 Edge;
 
-struct Node{
-    int x;
-    int y;
+struct Node {
+  int x;
+  int y;
 };
 
-struct Edge{
-    int x;
-    int y;
+struct Edge {
+  int x;
+  int y;
 };
 
 const int h_top = 1;
 const int zero = 0;
 
-void runCPU(int argc, char** argv);
-void runGPU(int argc, char** argv);
+void runCPU(int argc, char **argv);
+void runGPU(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////
-//the cpu version of bfs for speed comparison
-//the text book version ("Introduction to Algorithms")
+// the cpu version of bfs for speed comparison
+// the text book version ("Introduction to Algorithms")
 ////////////////////////////////////////////////////////////////////
-void  BFS_CPU( Node * h_graph_nodes,Edge * h_graph_edges,
-	int * color, int * h_cost, int source){
-	std::deque<int> wavefront;	
-	wavefront.push_back(source);
-	color[source] = GRAY;
-	int index;
-	while(!wavefront.empty()){
-		index = wavefront.front();
-		wavefront.pop_front();
+void BFS_CPU(Node *h_graph_nodes, Edge *h_graph_edges, int *color, int *h_cost,
+             int source) {
+  std::deque<int> wavefront;
+  wavefront.push_back(source);
+  color[source] = GRAY;
+  int index;
+  while (!wavefront.empty()) {
+    index = wavefront.front();
+    wavefront.pop_front();
 
 #pragma omp parallel for
-		for(int i=h_graph_nodes[index].x; 
-			i<(h_graph_nodes[index].y + 
-				h_graph_nodes[index].x); i++)
-			{
-				int id = h_graph_edges[i].x;
-				if(color[id] == WHITE){
-					h_cost[id]=h_cost[index]+1;
+    for (int i = h_graph_nodes[index].x;
+         i < (h_graph_nodes[index].y + h_graph_nodes[index].x); i++) {
+      int id = h_graph_edges[i].x;
+      if (color[id] == WHITE) {
+        h_cost[id] = h_cost[index] + 1;
 
 #pragma omp critical
-					wavefront.push_back(id);
-
-					color[id] = GRAY;
-				}
-			}
-			color[index] = BLACK;
-	}
-	
+        wavefront.push_back(id);
+
+        color[id] = GRAY;
+      }
+    }
+    color[index] = BLACK;
+  }
 }
 ////////////////////////////////////////////////////////////////////////////////
 // Main Program
 ////////////////////////////////////////////////////////////////////////////////
-int main( int argc, char** argv) 
-{
-	no_of_nodes=0;
-	edge_list_size=0;
-	runCPU(argc,argv);
+int main(int argc, char **argv) {
+  no_of_nodes = 0;
+  edge_list_size = 0;
+  runCPU(argc, argv);
 }
 
 ///////////////////////////////
-//FUNCTION: only run CPU version 
+// FUNCTION: only run CPU version
 ////////////////////////////////////////////
-void runCPU( int argc, char** argv) 
-{
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    pb_InitializeTimerSet(&timers);
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-    {
-        fprintf(stderr, "Expecting one input filename\n");
-        exit(-1);
-    }
-
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	//printf("Reading File\n");
-	//Read in Graph from a file
-	fp = fopen(params->inpFiles[0],"r");
-	if(!fp)
-	{
-		printf("Error Reading graph file\n");
-		return;
-	}
-
-	int source;
-
-	fscanf(fp,"%d",&no_of_nodes);
-	// allocate host memory
-	Node* h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
-	int *color = (int*) malloc(sizeof(int)*no_of_nodes);
-	int start, edgeno;   
-	// initalize the memory
-	for( unsigned int i = 0; i < no_of_nodes; i++) 
-	{
-		fscanf(fp,"%d %d",&start,&edgeno);
-		h_graph_nodes[i].x = start;
-		h_graph_nodes[i].y = edgeno;
-		color[i]=WHITE;
-	}
-	//read the source node from the file
-	fscanf(fp,"%d",&source);
-	fscanf(fp,"%d",&edge_list_size);
-	int id,cost;
-	Edge* h_graph_edges = (Edge*) malloc(sizeof(Edge)*edge_list_size);
-	for(int i=0; i < edge_list_size ; i++)
-	{
-		fscanf(fp,"%d",&id);
-		fscanf(fp,"%d",&cost);
-		h_graph_edges[i].x = id;
-		h_graph_edges[i].y = cost;
-	}
-	if(fp)
-		fclose(fp);    
-
-	//printf("Read File\n");
-
-	// allocate mem for the result on host side
-	int* h_cost = (int*) malloc( sizeof(int)*no_of_nodes);
-	for(int i = 0; i < no_of_nodes; i++){
-		h_cost[i] = INF;
-	}
-	h_cost[source] = 0;
-	//printf("start cpu version\n");
-	unsigned int cpu_timer = 0;
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	BFS_CPU( h_graph_nodes, h_graph_edges, color, h_cost,  source 
-		 );
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    if(params->outFile!=NULL)
-    {
-        //printf("Result stored in %s\n", params->outFile);
-        FILE *fp = fopen(params->outFile,"w");
-        fprintf(fp,"%d\n", no_of_nodes);
-    	for(int i=0;i<no_of_nodes;i++)
-    		fprintf(fp,"%d %d\n",i,h_cost[i]);
-    	fclose(fp);
-    }
-
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	// cleanup memory
-	free( h_graph_nodes);
-	free( h_graph_edges);
-	free( color);
-	free( h_cost);
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
-    pb_FreeParameters(params);
+void runCPU(int argc, char **argv) {
+
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
+
+  pb_InitializeTimerSet(&timers);
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) {
+    fprintf(stderr, "Expecting one input filename\n");
+    exit(-1);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // printf("Reading File\n");
+  // Read in Graph from a file
+  fp = fopen(params->inpFiles[0], "r");
+  if (!fp) {
+    printf("Error Reading graph file\n");
+    return;
+  }
+
+  int source;
+
+  fscanf(fp, "%d", &no_of_nodes);
+  // allocate host memory
+  Node *h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
+  int *color = (int *)malloc(sizeof(int) * no_of_nodes);
+  int start, edgeno;
+  // initialize the memory
+  for (unsigned int i = 0; i < no_of_nodes; i++) {
+    fscanf(fp, "%d %d", &start, &edgeno);
+    h_graph_nodes[i].x = start;
+    h_graph_nodes[i].y = edgeno;
+    color[i] = WHITE;
+  }
+  // read the source node from the file
+  fscanf(fp, "%d", &source);
+  fscanf(fp, "%d", &edge_list_size);
+  int id, cost;
+  Edge *h_graph_edges = (Edge *)malloc(sizeof(Edge) * edge_list_size);
+  for (int i = 0; i < edge_list_size; i++) {
+    fscanf(fp, "%d", &id);
+    fscanf(fp, "%d", &cost);
+    h_graph_edges[i].x = id;
+    h_graph_edges[i].y = cost;
+  }
+  if (fp)
+    fclose(fp);
+
+  // printf("Read File\n");
+
+  // allocate mem for the result on host side
+  int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
+  for (int i = 0; i < no_of_nodes; i++) {
+    h_cost[i] = INF;
+  }
+  h_cost[source] = 0;
+  // printf("start cpu version\n");
+  unsigned int cpu_timer = 0;
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  BFS_CPU(h_graph_nodes, h_graph_edges, color, h_cost, source);
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (params->outFile != NULL) {
+    // printf("Result stored in %s\n", params->outFile);
+    FILE *fp = fopen(params->outFile, "w");
+    fprintf(fp, "%d\n", no_of_nodes);
+    for (int i = 0; i < no_of_nodes; i++)
+      fprintf(fp, "%d %d\n", i, h_cost[i]);
+    fclose(fp);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // cleanup memory
+  free(h_graph_nodes);
+  free(h_graph_edges);
+  free(color);
+  free(h_cost);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(params);
 }
 ///////////////////////////////
-//FUNCTION:only run GPU version 
+// FUNCTION:only run GPU version
 ////////////////////////////////////////////
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.cpp
index 57368eda9ada364e6edf6e1eccd35758fa349b62..38e60a1cbff3d9e4ce8d56204e9213943ea4fd55 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.cpp
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.cpp
@@ -4,41 +4,47 @@
 #include <string.h>
 
 // -1 for NO suitable device found, 0 if an appropriate device was found
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
-      
-        // Supported Device Requests (anything that returns cl_bool)
-        //   CL_DEVICE_IMAGE_SUPPORT
-        //   CL_DEVICE_HOST_UNIFIED_MEMORY
-        //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
-        //   CL_DEVICE_AVAILABLE
-        //   CL_DEVICE_COMPILER_AVAILABLE
-  
+int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device,
+                    cl_device_type *reqDeviceType, int numRequests, ...) {
+
+  // Supported Device Requests (anything that returns cl_bool)
+  //   CL_DEVICE_IMAGE_SUPPORT
+  //   CL_DEVICE_HOST_UNIFIED_MEMORY
+  //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
+  //   CL_DEVICE_AVAILABLE
+  //   CL_DEVICE_COMPILER_AVAILABLE
+
   cl_uint numEntries = 16;
   cl_platform_id clPlatforms[numEntries];
   cl_uint numPlatforms;
-  
+
   cl_device_id clDevices[numEntries];
   cl_uint numDevices;
 
-  OCL_SIMPLE_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
+  OCL_SIMPLE_ERRCK_RETVAL(
+      clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms));
   fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
   bool needDevice = true;
-  
+
   for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
 
     cl_platform_id clPlatform = clPlatforms[ip];
-    
-    OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
-    fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip, numDevices);
-    
-    for (int id = 0; (id < numDevices) && needDevice ; ++id) {
+
+    OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL,
+                                           numEntries, clDevices, &numDevices));
+    fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip,
+            numDevices);
+
+    for (int id = 0; (id < numDevices) && needDevice; ++id) {
       cl_device_id clDevice = clDevices[id];
       cl_device_type clDeviceType;
 
       bool canSatisfy = true;
-      
+
       if (reqDeviceType != NULL) {
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
+        OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceInfo(clDevice, CL_DEVICE_TYPE,
+                                                sizeof(cl_device_type),
+                                                &clDeviceType, NULL));
         if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
           if (*reqDeviceType != clDeviceType) {
             canSatisfy = false;
@@ -48,32 +54,34 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty
 
       va_list paramList;
       va_start(paramList, numRequests);
-      for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
-      
-        cl_device_info devReq = va_arg( paramList, cl_device_info );  
+      for (int i = 0; (i < numRequests) && canSatisfy; ++i) {
+
+        cl_device_info devReq = va_arg(paramList, cl_device_info);
         cl_bool clInfoBool;
         size_t infoRetSize = sizeof(cl_bool);
-        
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
+
+        OCL_SIMPLE_ERRCK_RETVAL(
+            clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
         if (clInfoBool != true) {
           canSatisfy = false;
         }
       }
-      
+
       va_end(paramList);
       if (canSatisfy) {
         *device = clDevice;
         *platform = clPlatform;
         needDevice = false;
         fprintf(stderr, "Chose Device Type: %s\n",
-          (clDeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other"
-          );
+                (clDeviceType == CL_DEVICE_TYPE_CPU)
+                    ? "CPU"
+                    : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other");
         if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
           *reqDeviceType = clDeviceType;
         }
       }
     } // End checking all devices for a platform
-  } // End checking all platforms
+  }   // End checking all platforms
 
   int retVal = -1;
   if (needDevice) {
@@ -81,214 +89,213 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty
   } else {
     retVal = 0;
   }
-  
+
   return retVal;
 }
 
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
+const char *oclErrorString(cl_int error) {
+  // From NVIDIA SDK
+  static const char *errorString[] = {
+      "CL_SUCCESS",
+      "CL_DEVICE_NOT_FOUND",
+      "CL_DEVICE_NOT_AVAILABLE",
+      "CL_COMPILER_NOT_AVAILABLE",
+      "CL_MEM_OBJECT_ALLOCATION_FAILURE",
+      "CL_OUT_OF_RESOURCES",
+      "CL_OUT_OF_HOST_MEMORY",
+      "CL_PROFILING_INFO_NOT_AVAILABLE",
+      "CL_MEM_COPY_OVERLAP",
+      "CL_IMAGE_FORMAT_MISMATCH",
+      "CL_IMAGE_FORMAT_NOT_SUPPORTED",
+      "CL_BUILD_PROGRAM_FAILURE",
+      "CL_MAP_FAILURE",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "CL_INVALID_VALUE",
+      "CL_INVALID_DEVICE_TYPE",
+      "CL_INVALID_PLATFORM",
+      "CL_INVALID_DEVICE",
+      "CL_INVALID_CONTEXT",
+      "CL_INVALID_QUEUE_PROPERTIES",
+      "CL_INVALID_COMMAND_QUEUE",
+      "CL_INVALID_HOST_PTR",
+      "CL_INVALID_MEM_OBJECT",
+      "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
+      "CL_INVALID_IMAGE_SIZE",
+      "CL_INVALID_SAMPLER",
+      "CL_INVALID_BINARY",
+      "CL_INVALID_BUILD_OPTIONS",
+      "CL_INVALID_PROGRAM",
+      "CL_INVALID_PROGRAM_EXECUTABLE",
+      "CL_INVALID_KERNEL_NAME",
+      "CL_INVALID_KERNEL_DEFINITION",
+      "CL_INVALID_KERNEL",
+      "CL_INVALID_ARG_INDEX",
+      "CL_INVALID_ARG_VALUE",
+      "CL_INVALID_ARG_SIZE",
+      "CL_INVALID_KERNEL_ARGS",
+      "CL_INVALID_WORK_DIMENSION",
+      "CL_INVALID_WORK_GROUP_SIZE",
+      "CL_INVALID_WORK_ITEM_SIZE",
+      "CL_INVALID_GLOBAL_OFFSET",
+      "CL_INVALID_EVENT_WAIT_LIST",
+      "CL_INVALID_EVENT",
+      "CL_INVALID_OPERATION",
+      "CL_INVALID_GL_OBJECT",
+      "CL_INVALID_BUFFER_SIZE",
+      "CL_INVALID_MIP_LEVEL",
+      "CL_INVALID_GLOBAL_WORK_SIZE",
+  };
+
+  const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
+
+  const int index = -error;
+
+  return (index >= 0 && index < errorCount) ? errorString[index] : "";
 }
 
-const char* oclDebugErrString(cl_int error, cl_device_id device)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-	
-	if (index == 4) {
-	  cl_uint maxMemAlloc = 0;
-	  OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
-	  fprintf(stderr, "  Device Maximum block allocation size: %lu\n", maxMemAlloc);
-	}
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
+const char *oclDebugErrString(cl_int error, cl_device_id device) {
+  // From NVIDIA SDK
+  static const char *errorString[] = {
+      "CL_SUCCESS",
+      "CL_DEVICE_NOT_FOUND",
+      "CL_DEVICE_NOT_AVAILABLE",
+      "CL_COMPILER_NOT_AVAILABLE",
+      "CL_MEM_OBJECT_ALLOCATION_FAILURE",
+      "CL_OUT_OF_RESOURCES",
+      "CL_OUT_OF_HOST_MEMORY",
+      "CL_PROFILING_INFO_NOT_AVAILABLE",
+      "CL_MEM_COPY_OVERLAP",
+      "CL_IMAGE_FORMAT_MISMATCH",
+      "CL_IMAGE_FORMAT_NOT_SUPPORTED",
+      "CL_BUILD_PROGRAM_FAILURE",
+      "CL_MAP_FAILURE",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "CL_INVALID_VALUE",
+      "CL_INVALID_DEVICE_TYPE",
+      "CL_INVALID_PLATFORM",
+      "CL_INVALID_DEVICE",
+      "CL_INVALID_CONTEXT",
+      "CL_INVALID_QUEUE_PROPERTIES",
+      "CL_INVALID_COMMAND_QUEUE",
+      "CL_INVALID_HOST_PTR",
+      "CL_INVALID_MEM_OBJECT",
+      "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
+      "CL_INVALID_IMAGE_SIZE",
+      "CL_INVALID_SAMPLER",
+      "CL_INVALID_BINARY",
+      "CL_INVALID_BUILD_OPTIONS",
+      "CL_INVALID_PROGRAM",
+      "CL_INVALID_PROGRAM_EXECUTABLE",
+      "CL_INVALID_KERNEL_NAME",
+      "CL_INVALID_KERNEL_DEFINITION",
+      "CL_INVALID_KERNEL",
+      "CL_INVALID_ARG_INDEX",
+      "CL_INVALID_ARG_VALUE",
+      "CL_INVALID_ARG_SIZE",
+      "CL_INVALID_KERNEL_ARGS",
+      "CL_INVALID_WORK_DIMENSION",
+      "CL_INVALID_WORK_GROUP_SIZE",
+      "CL_INVALID_WORK_ITEM_SIZE",
+      "CL_INVALID_GLOBAL_OFFSET",
+      "CL_INVALID_EVENT_WAIT_LIST",
+      "CL_INVALID_EVENT",
+      "CL_INVALID_OPERATION",
+      "CL_INVALID_GL_OBJECT",
+      "CL_INVALID_BUFFER_SIZE",
+      "CL_INVALID_MIP_LEVEL",
+      "CL_INVALID_GLOBAL_WORK_SIZE",
+  };
+
+  const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
+
+  const int index = -error;
+
+  if (index == 4) { // errorString[4] is CL_MEM_OBJECT_ALLOCATION_FAILURE
+    cl_ulong maxMemAlloc = 0; // this query writes a cl_ulong, not a cl_uint
+    OCL_SIMPLE_ERRCK_RETVAL(
+        clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                        sizeof(maxMemAlloc), &maxMemAlloc, NULL));
+    fprintf(stderr, "  Device Maximum block allocation size: %llu\n",
+            (unsigned long long)maxMemAlloc);
+  }
+
+  return (index >= 0 && index < errorCount) ? errorString[index] : "";
 }
 
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
+char *oclLoadProgSource(const char *cFilename, const char *cPreamble,
+                        size_t *szFinalLength) {
+  // locals
+  FILE *pFileStream = NULL;
+  size_t szSourceLength;
+
+// open the OpenCL source code file
+#ifdef _WIN32 // Windows version
+  if (fopen_s(&pFileStream, cFilename, "rb") != 0) {
+    return NULL;
+  }
+#else // Linux version
+  pFileStream = fopen(cFilename, "rb");
+  if (pFileStream == 0) {
+    return NULL;
+  }
+#endif
+
+  size_t szPreambleLength = strlen(cPreamble);
+
+  // get the length of the source code
+  fseek(pFileStream, 0, SEEK_END);
+  szSourceLength = ftell(pFileStream);
+  fseek(pFileStream, 0, SEEK_SET);
+
+  // allocate a buffer for the source code string and read it in
+  char *cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
+  memcpy(cSourceString, cPreamble, szPreambleLength);
+  if (fread((cSourceString) + szPreambleLength, szSourceLength, 1,
+            pFileStream) != 1) {
     fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
+    free(cSourceString);
+    return 0;
+  }
+
+  // close the file and return the total length of the combined (preamble +
+  // source) string
+  fclose(pFileStream);
+  if (szFinalLength != 0) {
+    *szFinalLength = szSourceLength + szPreambleLength;
+  }
+  cSourceString[szSourceLength + szPreambleLength] = '\0';
 
-    return cSourceString;
+  return cSourceString;
 }
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.h
index 976c692055501532d65a1ac25e74630732fd2a86..27b084487c6289196337ca064b94f1353f8bbbad 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.h
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/OpenCL_common.h
@@ -2,26 +2,40 @@
 #ifndef __OPENCL_COMMON_H_
 #define __OPENCL_COMMON_H_
 
-#include <stdio.h>
+#include <CL/cl.h>
 #include <stdarg.h>
+#include <stdio.h>
 #include <string.h>
-#include <CL/cl.h>
 
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
-const char* oclErrorString(cl_int error);
-const char* oclDebugErrString(cl_int error, cl_device_id device);
-
-#define OCL_ERRCK_VAR(var) \
-  { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }  
-  
-#define OCL_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclDebugErrString(clerr, clDevice)); }
-    
-#define OCL_SIMPLE_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
+int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device,
+                    cl_device_type *reqDeviceType, int numRequests, ...);
+const char *oclErrorString(cl_int error);
+const char *oclDebugErrString(cl_int error, cl_device_id device);
+
+#define OCL_ERRCK_VAR(var)                                                     \
+  {                                                                            \
+    if (var != CL_SUCCESS)                                                     \
+      fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__,       \
+              oclErrorString(var));                                            \
+  }
+
+#define OCL_ERRCK_RETVAL(s)                                                    \
+  {                                                                            \
+    cl_int clerr = (s);                                                        \
+    if (clerr != CL_SUCCESS)                                                   \
+      fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__,       \
+              oclDebugErrString(clerr, clDevice));                             \
+  }
+
+#define OCL_SIMPLE_ERRCK_RETVAL(s)                                             \
+  {                                                                            \
+    cl_int clerr = (s);                                                        \
+    if (clerr != CL_SUCCESS)                                                   \
+      fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__,       \
+              oclErrorString(clerr));                                          \
+  }
+
+char *oclLoadProgSource(const char *cFilename, const char *cPreamble,
+                        size_t *szFinalLength);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/config.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/config.h
index 1a00ef98e054e50e654b0a52ccbb05ce136bab27..f9cdb59e9cd6cc39364fd9389ee39216646aedb2 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/config.h
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/config.h
@@ -1,7 +1,8 @@
 #define MAX_THREADS_PER_BLOCK 256
-#define LOCAL_MEM_SIZE 1600 //This needs to be adjusted for certain graphs with high degrees
-#define INF 2147483647//2^31-1
-#define UP_LIMIT 16677216//2^24
+#define LOCAL_MEM_SIZE                                                         \
+  1600 // This needs to be adjusted for certain graphs with high degrees
+#define INF 2147483647    // 2^31-1
+#define UP_LIMIT 16677216 // ~2^24 (exact 2^24 is 16777216)
 #define WHITE 16677217
 #define GRAY 16677218
 #define GRAY0 16677219
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/main.cpp
index c38fcec8b895eb2559d7c4f1fc974ad4b2cf97e3..278c1bf085c9a5f0ea4809b61806c1a647e9afe5 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/main.cpp
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_base/main.cpp
@@ -12,61 +12,56 @@
   Copyright (c) 2010 University of Illinois at Urbana-Champaign.
   All rights reserved.
 
-  Permission to use, copy, modify and distribute this software and its documentation for
-  educational purpose is hereby granted without fee, provided that the above copyright
-  notice and this permission notice appear in all copies of this software and that you do
-  not sell the software.
+  Permission to use, copy, modify and distribute this software and its
+  documentation for educational purpose is hereby granted without fee, provided
+  that the above copyright notice and this permission notice appear in all
+  copies of this software and that you do not sell the software.
 
-  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR
-  OTHERWISE.
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS,
+  IMPLIED OR OTHERWISE.
 
   Author: Lijiuan Luo (lluo3@uiuc.edu)
-  Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu (gengliu2@illinois.edu)
+  Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu
+  (gengliu2@illinois.edu)
 */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <CL/cl.h>
-#include "parboil.h"
 #include "OpenCL_common.h"
 #include "config.h"
+#include "parboil.h"
+#include <CL/cl.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
-
-#define CHECK_ERROR(errorMessage)        \
-if(clStatus != CL_SUCCESS)               \
-{                                        \
-  printf("Error: %s!\n",errorMessage);   \
-  printf("Line: %d\n",__LINE__);         \
-  exit(1);                               \
-}
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
+  }
 
 FILE *fp;
-char* readFile(const char* fileName)
-{
-  FILE* fp;
-  fp = fopen(fileName,"r");
-  if(fp == NULL)
-  {
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
     printf("Error 1!\n");
     exit(1);
   }
 
-  fseek(fp,0,SEEK_END);
+  fseek(fp, 0, SEEK_END);
   long size = ftell(fp);
   rewind(fp);
 
-  char* buffer = (char*)malloc(sizeof(char)*size);
-  if(buffer  == NULL)
-  {
+  char *buffer = (char *)malloc(sizeof(char) * size);
+  if (buffer == NULL) {
     printf("Error 2!\n");
     fclose(fp);
     exit(1);
   }
 
-  size_t res = fread(buffer,1,size,fp);
-  if(res != size)
-  {
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
     printf("Error 3!\n");
     fclose(fp);
     exit(1);
@@ -77,70 +72,67 @@ char* readFile(const char* fileName)
 }
 const int h_top = 1;
 const int zero = 0;
-void runGPU(int argc, char** argv);
+void runGPU(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Main Program
 ////////////////////////////////////////////////////////////////////////////////
-int main( int argc, char** argv)
-{
+int main(int argc, char **argv) {
 
-  //the number of nodes in the graph
+  // the number of nodes in the graph
   int num_of_nodes = 0;
-  //the number of edges in the graph
+  // the number of edges in the graph
   int num_of_edges = 0;
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
 
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-  {
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) {
     fprintf(stderr, "Expecting one input filename\n");
     exit(-1);
   }
 
-  //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  //Read in Graph from a file
-  fp = fopen(params->inpFiles[0],"r");
-  if(!fp)
-  {
+  // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // Read in Graph from a file
+  fp = fopen(params->inpFiles[0], "r");
+  if (!fp) {
     printf("Error Reading graph file\n");
     return 0;
   }
   int source;
 
-  fscanf(fp,"%d",&num_of_nodes);
+  fscanf(fp, "%d", &num_of_nodes);
   // allocate host memory
-  struct Node* h_graph_nodes = (struct Node*) malloc(sizeof(struct Node)*num_of_nodes);
-  int *color = (int*) malloc(sizeof(int)*num_of_nodes);
+  struct Node *h_graph_nodes =
+      (struct Node *)malloc(sizeof(struct Node) * num_of_nodes);
+  int *color = (int *)malloc(sizeof(int) * num_of_nodes);
   int start, edgeno;
   // initalize the memory
   int i;
-  for( i = 0; i < num_of_nodes; i++)
-  {
-    fscanf(fp,"%d %d",&start,&edgeno);
+  for (i = 0; i < num_of_nodes; i++) {
+    fscanf(fp, "%d %d", &start, &edgeno);
     h_graph_nodes[i].x = start;
     h_graph_nodes[i].y = edgeno;
-    color[i]=WHITE;
+    color[i] = WHITE;
   }
-  //read the source node from the file
-  fscanf(fp,"%d",&source);
-  fscanf(fp,"%d",&num_of_edges);
-  int id,cost;
-  struct Edge* h_graph_edges = (struct Edge*) malloc(sizeof(struct Edge)*num_of_edges);
-  for(i=0; i < num_of_edges ; i++)
-  {
-    fscanf(fp,"%d",&id);
-    fscanf(fp,"%d",&cost);
+  // read the source node from the file
+  fscanf(fp, "%d", &source);
+  fscanf(fp, "%d", &num_of_edges);
+  int id, cost;
+  struct Edge *h_graph_edges =
+      (struct Edge *)malloc(sizeof(struct Edge) * num_of_edges);
+  for (i = 0; i < num_of_edges; i++) {
+    fscanf(fp, "%d", &id);
+    fscanf(fp, "%d", &cost);
     h_graph_edges[i].x = id;
     h_graph_edges[i].y = cost;
   }
-  if(fp)
+  if (fp)
     fclose(fp);
 
   pb_InitializeTimerSet(&timers);
   // allocate mem for the result on host side
-  int* h_cost = (int*) malloc( sizeof(int)*num_of_nodes);
-  for(i = 0; i < num_of_nodes; i++){
+  int *h_cost = (int *)malloc(sizeof(int) * num_of_nodes);
+  for (i = 0; i < num_of_nodes; i++) {
     h_cost[i] = INF;
   }
   h_cost[source] = 0;
@@ -151,17 +143,20 @@ int main( int argc, char** argv)
   cl_device_id clDevice;
   cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
   cl_platform_id clPlatform;
-  OCL_ERRCK_RETVAL(clGetPlatformIDs(1,&clPlatform,NULL));
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
+  OCL_ERRCK_RETVAL(clGetPlatformIDs(1, &clPlatform, NULL));
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
   int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 0);
   if (deviceFound < 0) {
     fprintf(stderr, "No suitable device was found\n");
     exit(1);
   }
 
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   OCL_ERRCK_VAR(clStatus);
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
@@ -170,111 +165,151 @@ int main( int argc, char** argv)
   size_t program_length;
   const char *clSource_path = "src/opencl_base/kernel.cl";
   clSource = oclLoadProgSource(clSource_path, "", &program_length);
-  //printf("Program Source:\n%s\n", clSource);
-  cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&clSource, &program_length, &clStatus);
+  // printf("Program Source:\n%s\n", clSource);
+  cl_program clProgram = clCreateProgramWithSource(
+      clContext, 1, (const char **)&clSource, &program_length, &clStatus);
   OCL_ERRCK_VAR(clStatus);
 
   char clOptions[50];
-  sprintf(clOptions,"-I src/opencl_base");
-  OCL_ERRCK_RETVAL(clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL));
+  sprintf(clOptions, "-I src/opencl_base");
+  OCL_ERRCK_RETVAL(
+      clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL));
 
   // Uncomment to view build log from compiler for debugging
   /*
   char *build_log;
   size_t ret_val_size;
-  clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
-  build_log = (char *)malloc(ret_val_size+1);
-  clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-  // there's no information in the reference whether the string is 0 terminated or not
-  build_log[ret_val_size] = '\0';
-  printf("%s\n", build_log );
+  clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0,
+  NULL, &ret_val_size); build_log = (char *)malloc(ret_val_size+1); clStatus =
+  clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size,
+  build_log, NULL);
+  // there's no information in the reference whether the string is 0 terminated
+  or not build_log[ret_val_size] = '\0'; printf("%s\n", build_log );
   */
 
-  cl_kernel BFS_kernel = clCreateKernel(clProgram,"BFS_kernel",&clStatus);
+  cl_kernel BFS_kernel = clCreateKernel(clProgram, "BFS_kernel", &clStatus);
   OCL_ERRCK_VAR(clStatus);
 
-  //Copy the Node list to device memory
+  // Copy the Node list to device memory
   cl_mem d_graph_nodes;
-  d_graph_nodes = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_nodes*sizeof(struct Node),NULL,&clStatus);
+  d_graph_nodes =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                     num_of_nodes * sizeof(struct Node), NULL, &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_nodes,CL_TRUE,0,num_of_nodes*sizeof(struct Node),h_graph_nodes,0,NULL,NULL));
-  //Copy the Edge List to device Memory
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_nodes, CL_TRUE,
+                                        0, num_of_nodes * sizeof(struct Node),
+                                        h_graph_nodes, 0, NULL, NULL));
+  // Copy the Edge List to device Memory
   cl_mem d_graph_edges;
-  d_graph_edges = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_edges*sizeof(struct Edge),NULL,&clStatus);
+  d_graph_edges =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                     num_of_edges * sizeof(struct Edge), NULL, &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_edges,CL_TRUE,0,num_of_edges*sizeof(struct Edge),h_graph_edges,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_edges, CL_TRUE,
+                                        0, num_of_edges * sizeof(struct Edge),
+                                        h_graph_edges, 0, NULL, NULL));
 
   cl_mem d_color, d_cost, d_q1, d_q2, tail;
-  d_color = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  d_cost = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  d_q1 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  d_q2 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  tail = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus);
+  d_color = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                           num_of_nodes * sizeof(int), NULL, &clStatus);
+  d_cost = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                          num_of_nodes * sizeof(int), NULL, &clStatus);
+  d_q1 = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                        num_of_nodes * sizeof(int), NULL, &clStatus);
+  d_q2 = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                        num_of_nodes * sizeof(int), NULL, &clStatus);
+  tail = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL,
+                        &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_color, CL_TRUE, 0,
+                                        num_of_nodes * sizeof(int), color, 0,
+                                        NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0,
+                                        num_of_nodes * sizeof(int), h_cost, 0,
+                                        NULL, NULL));
 
   printf("Starting GPU kernel\n");
   pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
   int num_of_blocks;
   int num_of_threads_per_block;
 
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&h_top,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q1,CL_TRUE,0,sizeof(int),&source,0,NULL,NULL));
-
-  int num_t;//number of threads
-  int k=0;//BFS level index
-
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,2,sizeof(cl_mem),(void*)&d_graph_nodes));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,3,sizeof(cl_mem),(void*)&d_graph_edges));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,4,sizeof(cl_mem),(void*)&d_color));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,5,sizeof(cl_mem),(void*)&d_cost));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,6,sizeof(cl_mem),(void*)&tail));
-
-  do
-  {
-    OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL));
-    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
-
-    if(num_t == 0){//frontier is empty
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0,
+                                        sizeof(int), &h_top, 0, NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0,
+                                        sizeof(int), &zero, 0, NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_q1, CL_TRUE, 0,
+                                        sizeof(int), &source, 0, NULL, NULL));
+
+  int num_t; // number of threads
+  int k = 0; // BFS level index
+
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel, 2, sizeof(cl_mem), (void *)&d_graph_nodes));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel, 3, sizeof(cl_mem), (void *)&d_graph_edges));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel, 4, sizeof(cl_mem), (void *)&d_color));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel, 5, sizeof(cl_mem), (void *)&d_cost));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel, 6, sizeof(cl_mem), (void *)&tail));
+
+  do {
+    OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, tail, CL_TRUE, 0,
+                                         sizeof(int), &num_t, 0, NULL, NULL));
+    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0,
+                                          sizeof(int), &zero, 0, NULL, NULL));
+
+    if (num_t == 0) { // frontier is empty
       break;
     }
 
-    num_of_blocks = (int)ceil(num_t/(double)MAX_THREADS_PER_BLOCK);
-    num_of_threads_per_block = num_t > MAX_THREADS_PER_BLOCK ? MAX_THREADS_PER_BLOCK : num_t;
+    num_of_blocks = (int)ceil(num_t / (double)MAX_THREADS_PER_BLOCK);
+    num_of_threads_per_block =
+        num_t > MAX_THREADS_PER_BLOCK ? MAX_THREADS_PER_BLOCK : num_t;
 
-    size_t grid[1] = {num_of_blocks*num_of_threads_per_block};
+    size_t grid[1] = {num_of_blocks * num_of_threads_per_block};
     size_t block[1] = {num_of_threads_per_block};
 
-
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,7,sizeof(int),(void*)&num_t));
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,9,sizeof(int),(void*)&k));
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,10,sizeof(int),NULL));
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,11,LOCAL_MEM_SIZE*sizeof(int),NULL));
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,12,sizeof(int),NULL));
-    if(k%2 == 0){
+    OCL_ERRCK_RETVAL(
+        clSetKernelArg(BFS_kernel, 7, sizeof(int), (void *)&num_t));
+    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 9, sizeof(int), (void *)&k));
+    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 10, sizeof(int), NULL));
+    OCL_ERRCK_RETVAL(
+        clSetKernelArg(BFS_kernel, 11, LOCAL_MEM_SIZE * sizeof(int), NULL));
+    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 12, sizeof(int), NULL));
+    if (k % 2 == 0) {
       int gray = GRAY0;
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q1));
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q2));
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray));
-    }
-    else{
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 0, sizeof(cl_mem), (void *)&d_q1));
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 1, sizeof(cl_mem), (void *)&d_q2));
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 8, sizeof(int), (void *)&gray));
+    } else {
       int gray = GRAY1;
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q2));
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q1));
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray));
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 0, sizeof(cl_mem), (void *)&d_q2));
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 1, sizeof(cl_mem), (void *)&d_q1));
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 8, sizeof(int), (void *)&gray));
     }
-    OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel,1,0,grid,block,0,0,0));
+    OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel, 1, 0,
+                                            grid, block, 0, 0, 0));
     OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
     k++;
-  } while(1);
+  } while (1);
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
   printf("GPU kernel done\n");
 
   // copy result from device to host
-  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_cost, CL_TRUE, 0,
+                                       num_of_nodes * sizeof(int), h_cost, 0,
+                                       NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_color, CL_TRUE, 0,
+                                       num_of_nodes * sizeof(int), color, 0,
+                                       NULL, NULL));
 
   OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_nodes));
   OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_edges));
@@ -285,14 +320,13 @@ int main( int argc, char** argv)
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
-  
-  //Store the result into a file
-  //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  FILE *fp = fopen(params->outFile,"w");
+  // Store the result into a file
+  // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  FILE *fp = fopen(params->outFile, "w");
   fprintf(fp, "%d\n", num_of_nodes);
   int j = 0;
-  for(j=0;j<num_of_nodes;j++)
-    fprintf(fp,"%d %d\n",j,h_cost[j]);
+  for (j = 0; j < num_of_nodes; j++)
+    fprintf(fp, "%d %d\n", j, h_cost[j]);
   fclose(fp);
   // cleanup memory
   free(h_graph_nodes);
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.cpp
index 57368eda9ada364e6edf6e1eccd35758fa349b62..38e60a1cbff3d9e4ce8d56204e9213943ea4fd55 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.cpp
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.cpp
@@ -4,41 +4,47 @@
 #include <string.h>
 
 // -1 for NO suitable device found, 0 if an appropriate device was found
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
-      
-        // Supported Device Requests (anything that returns cl_bool)
-        //   CL_DEVICE_IMAGE_SUPPORT
-        //   CL_DEVICE_HOST_UNIFIED_MEMORY
-        //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
-        //   CL_DEVICE_AVAILABLE
-        //   CL_DEVICE_COMPILER_AVAILABLE
-  
+int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device,
+                    cl_device_type *reqDeviceType, int numRequests, ...) {
+
+  // Supported Device Requests (anything that returns cl_bool)
+  //   CL_DEVICE_IMAGE_SUPPORT
+  //   CL_DEVICE_HOST_UNIFIED_MEMORY
+  //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
+  //   CL_DEVICE_AVAILABLE
+  //   CL_DEVICE_COMPILER_AVAILABLE
+
   cl_uint numEntries = 16;
   cl_platform_id clPlatforms[numEntries];
   cl_uint numPlatforms;
-  
+
   cl_device_id clDevices[numEntries];
   cl_uint numDevices;
 
-  OCL_SIMPLE_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
+  OCL_SIMPLE_ERRCK_RETVAL(
+      clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms));
   fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
   bool needDevice = true;
-  
+
   for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
 
     cl_platform_id clPlatform = clPlatforms[ip];
-    
-    OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
-    fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip, numDevices);
-    
-    for (int id = 0; (id < numDevices) && needDevice ; ++id) {
+
+    OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL,
+                                           numEntries, clDevices, &numDevices));
+    fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip,
+            numDevices);
+
+    for (int id = 0; (id < numDevices) && needDevice; ++id) {
       cl_device_id clDevice = clDevices[id];
       cl_device_type clDeviceType;
 
       bool canSatisfy = true;
-      
+
       if (reqDeviceType != NULL) {
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
+        OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceInfo(clDevice, CL_DEVICE_TYPE,
+                                                sizeof(cl_device_type),
+                                                &clDeviceType, NULL));
         if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
           if (*reqDeviceType != clDeviceType) {
             canSatisfy = false;
@@ -48,32 +54,34 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty
 
       va_list paramList;
       va_start(paramList, numRequests);
-      for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
-      
-        cl_device_info devReq = va_arg( paramList, cl_device_info );  
+      for (int i = 0; (i < numRequests) && canSatisfy; ++i) {
+
+        cl_device_info devReq = va_arg(paramList, cl_device_info);
         cl_bool clInfoBool;
         size_t infoRetSize = sizeof(cl_bool);
-        
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
+
+        OCL_SIMPLE_ERRCK_RETVAL(
+            clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
         if (clInfoBool != true) {
           canSatisfy = false;
         }
       }
-      
+
       va_end(paramList);
       if (canSatisfy) {
         *device = clDevice;
         *platform = clPlatform;
         needDevice = false;
         fprintf(stderr, "Chose Device Type: %s\n",
-          (clDeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other"
-          );
+                (clDeviceType == CL_DEVICE_TYPE_CPU)
+                    ? "CPU"
+                    : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other");
         if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
           *reqDeviceType = clDeviceType;
         }
       }
     } // End checking all devices for a platform
-  } // End checking all platforms
+  }   // End checking all platforms
 
   int retVal = -1;
   if (needDevice) {
@@ -81,214 +89,213 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty
   } else {
     retVal = 0;
   }
-  
+
   return retVal;
 }
 
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
+const char *oclErrorString(cl_int error) {
+  // From NVIDIA SDK
+  static const char *errorString[] = {
+      "CL_SUCCESS",
+      "CL_DEVICE_NOT_FOUND",
+      "CL_DEVICE_NOT_AVAILABLE",
+      "CL_COMPILER_NOT_AVAILABLE",
+      "CL_MEM_OBJECT_ALLOCATION_FAILURE",
+      "CL_OUT_OF_RESOURCES",
+      "CL_OUT_OF_HOST_MEMORY",
+      "CL_PROFILING_INFO_NOT_AVAILABLE",
+      "CL_MEM_COPY_OVERLAP",
+      "CL_IMAGE_FORMAT_MISMATCH",
+      "CL_IMAGE_FORMAT_NOT_SUPPORTED",
+      "CL_BUILD_PROGRAM_FAILURE",
+      "CL_MAP_FAILURE",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "CL_INVALID_VALUE",
+      "CL_INVALID_DEVICE_TYPE",
+      "CL_INVALID_PLATFORM",
+      "CL_INVALID_DEVICE",
+      "CL_INVALID_CONTEXT",
+      "CL_INVALID_QUEUE_PROPERTIES",
+      "CL_INVALID_COMMAND_QUEUE",
+      "CL_INVALID_HOST_PTR",
+      "CL_INVALID_MEM_OBJECT",
+      "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
+      "CL_INVALID_IMAGE_SIZE",
+      "CL_INVALID_SAMPLER",
+      "CL_INVALID_BINARY",
+      "CL_INVALID_BUILD_OPTIONS",
+      "CL_INVALID_PROGRAM",
+      "CL_INVALID_PROGRAM_EXECUTABLE",
+      "CL_INVALID_KERNEL_NAME",
+      "CL_INVALID_KERNEL_DEFINITION",
+      "CL_INVALID_KERNEL",
+      "CL_INVALID_ARG_INDEX",
+      "CL_INVALID_ARG_VALUE",
+      "CL_INVALID_ARG_SIZE",
+      "CL_INVALID_KERNEL_ARGS",
+      "CL_INVALID_WORK_DIMENSION",
+      "CL_INVALID_WORK_GROUP_SIZE",
+      "CL_INVALID_WORK_ITEM_SIZE",
+      "CL_INVALID_GLOBAL_OFFSET",
+      "CL_INVALID_EVENT_WAIT_LIST",
+      "CL_INVALID_EVENT",
+      "CL_INVALID_OPERATION",
+      "CL_INVALID_GL_OBJECT",
+      "CL_INVALID_BUFFER_SIZE",
+      "CL_INVALID_MIP_LEVEL",
+      "CL_INVALID_GLOBAL_WORK_SIZE",
+  };
+
+  const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
+
+  const int index = -error;
+
+  return (index >= 0 && index < errorCount) ? errorString[index] : "";
 }
 
-const char* oclDebugErrString(cl_int error, cl_device_id device)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-	
-	if (index == 4) {
-	  cl_uint maxMemAlloc = 0;
-	  OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
-	  fprintf(stderr, "  Device Maximum block allocation size: %lu\n", maxMemAlloc);
-	}
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
+const char *oclDebugErrString(cl_int error, cl_device_id device) {
+  // From NVIDIA SDK
+  static const char *errorString[] = {
+      "CL_SUCCESS",
+      "CL_DEVICE_NOT_FOUND",
+      "CL_DEVICE_NOT_AVAILABLE",
+      "CL_COMPILER_NOT_AVAILABLE",
+      "CL_MEM_OBJECT_ALLOCATION_FAILURE",
+      "CL_OUT_OF_RESOURCES",
+      "CL_OUT_OF_HOST_MEMORY",
+      "CL_PROFILING_INFO_NOT_AVAILABLE",
+      "CL_MEM_COPY_OVERLAP",
+      "CL_IMAGE_FORMAT_MISMATCH",
+      "CL_IMAGE_FORMAT_NOT_SUPPORTED",
+      "CL_BUILD_PROGRAM_FAILURE",
+      "CL_MAP_FAILURE",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "CL_INVALID_VALUE",
+      "CL_INVALID_DEVICE_TYPE",
+      "CL_INVALID_PLATFORM",
+      "CL_INVALID_DEVICE",
+      "CL_INVALID_CONTEXT",
+      "CL_INVALID_QUEUE_PROPERTIES",
+      "CL_INVALID_COMMAND_QUEUE",
+      "CL_INVALID_HOST_PTR",
+      "CL_INVALID_MEM_OBJECT",
+      "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
+      "CL_INVALID_IMAGE_SIZE",
+      "CL_INVALID_SAMPLER",
+      "CL_INVALID_BINARY",
+      "CL_INVALID_BUILD_OPTIONS",
+      "CL_INVALID_PROGRAM",
+      "CL_INVALID_PROGRAM_EXECUTABLE",
+      "CL_INVALID_KERNEL_NAME",
+      "CL_INVALID_KERNEL_DEFINITION",
+      "CL_INVALID_KERNEL",
+      "CL_INVALID_ARG_INDEX",
+      "CL_INVALID_ARG_VALUE",
+      "CL_INVALID_ARG_SIZE",
+      "CL_INVALID_KERNEL_ARGS",
+      "CL_INVALID_WORK_DIMENSION",
+      "CL_INVALID_WORK_GROUP_SIZE",
+      "CL_INVALID_WORK_ITEM_SIZE",
+      "CL_INVALID_GLOBAL_OFFSET",
+      "CL_INVALID_EVENT_WAIT_LIST",
+      "CL_INVALID_EVENT",
+      "CL_INVALID_OPERATION",
+      "CL_INVALID_GL_OBJECT",
+      "CL_INVALID_BUFFER_SIZE",
+      "CL_INVALID_MIP_LEVEL",
+      "CL_INVALID_GLOBAL_WORK_SIZE",
+  };
+
+  const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
+
+  const int index = -error;
+
+  if (index == 4) {
+    cl_uint maxMemAlloc = 0;
+    OCL_SIMPLE_ERRCK_RETVAL(
+        clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong),
+                        &maxMemAlloc, NULL));
+    fprintf(stderr, "  Device Maximum block allocation size: %lu\n",
+            maxMemAlloc);
+  }
+
+  return (index >= 0 && index < errorCount) ? errorString[index] : "";
 }
 
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
+char *oclLoadProgSource(const char *cFilename, const char *cPreamble,
+                        size_t *szFinalLength) {
+  // locals
+  FILE *pFileStream = NULL;
+  size_t szSourceLength;
+
+// open the OpenCL source code file
+#ifdef _WIN32 // Windows version
+  if (fopen_s(&pFileStream, cFilename, "rb") != 0) {
+    return NULL;
+  }
+#else // Linux version
+  pFileStream = fopen(cFilename, "rb");
+  if (pFileStream == 0) {
+    return NULL;
+  }
+#endif
+
+  size_t szPreambleLength = strlen(cPreamble);
+
+  // get the length of the source code
+  fseek(pFileStream, 0, SEEK_END);
+  szSourceLength = ftell(pFileStream);
+  fseek(pFileStream, 0, SEEK_SET);
+
+  // allocate a buffer for the source code string and read it in
+  char *cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
+  memcpy(cSourceString, cPreamble, szPreambleLength);
+  if (fread((cSourceString) + szPreambleLength, szSourceLength, 1,
+            pFileStream) != 1) {
     fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
+    free(cSourceString);
+    return 0;
+  }
+
+  // close the file and return the total length of the combined (preamble +
+  // source) string
+  fclose(pFileStream);
+  if (szFinalLength != 0) {
+    *szFinalLength = szSourceLength + szPreambleLength;
+  }
+  cSourceString[szSourceLength + szPreambleLength] = '\0';
 
-    return cSourceString;
+  return cSourceString;
 }
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.h
index 976c692055501532d65a1ac25e74630732fd2a86..27b084487c6289196337ca064b94f1353f8bbbad 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.h
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/OpenCL_common.h
@@ -2,26 +2,40 @@
 #ifndef __OPENCL_COMMON_H_
 #define __OPENCL_COMMON_H_
 
-#include <stdio.h>
+#include <CL/cl.h>
 #include <stdarg.h>
+#include <stdio.h>
 #include <string.h>
-#include <CL/cl.h>
 
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
-const char* oclErrorString(cl_int error);
-const char* oclDebugErrString(cl_int error, cl_device_id device);
-
-#define OCL_ERRCK_VAR(var) \
-  { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }  
-  
-#define OCL_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclDebugErrString(clerr, clDevice)); }
-    
-#define OCL_SIMPLE_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
+int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device,
+                    cl_device_type *reqDeviceType, int numRequests, ...);
+const char *oclErrorString(cl_int error);
+const char *oclDebugErrString(cl_int error, cl_device_id device);
+
+#define OCL_ERRCK_VAR(var)                                                     \
+  {                                                                            \
+    if (var != CL_SUCCESS)                                                     \
+      fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__,       \
+              oclErrorString(var));                                            \
+  }
+
+#define OCL_ERRCK_RETVAL(s)                                                    \
+  {                                                                            \
+    cl_int clerr = (s);                                                        \
+    if (clerr != CL_SUCCESS)                                                   \
+      fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__,       \
+              oclDebugErrString(clerr, clDevice));                             \
+  }
+
+#define OCL_SIMPLE_ERRCK_RETVAL(s)                                             \
+  {                                                                            \
+    cl_int clerr = (s);                                                        \
+    if (clerr != CL_SUCCESS)                                                   \
+      fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__,       \
+              oclErrorString(clerr));                                          \
+  }
+
+char *oclLoadProgSource(const char *cFilename, const char *cPreamble,
+                        size_t *szFinalLength);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/config.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/config.h
index 1a00ef98e054e50e654b0a52ccbb05ce136bab27..f9cdb59e9cd6cc39364fd9389ee39216646aedb2 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/config.h
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/config.h
@@ -1,7 +1,8 @@
 #define MAX_THREADS_PER_BLOCK 256
-#define LOCAL_MEM_SIZE 1600 //This needs to be adjusted for certain graphs with high degrees
-#define INF 2147483647//2^31-1
-#define UP_LIMIT 16677216//2^24
+#define LOCAL_MEM_SIZE                                                         \
+  1600 // This needs to be adjusted for certain graphs with high degrees
+#define INF 2147483647    // 2^31-1
+#define UP_LIMIT 16677216 // 2^24
 #define WHITE 16677217
 #define GRAY 16677218
 #define GRAY0 16677219
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp
index 8e021463567b304f384993052692668559166fe6..9b8b502688abb01934b337bc7fb178b32fda4633 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_cpu_baseline/main.cpp
@@ -12,61 +12,56 @@
   Copyright (c) 2010 University of Illinois at Urbana-Champaign.
   All rights reserved.
 
-  Permission to use, copy, modify and distribute this software and its documentation for
-  educational purpose is hereby granted without fee, provided that the above copyright
-  notice and this permission notice appear in all copies of this software and that you do
-  not sell the software.
+  Permission to use, copy, modify and distribute this software and its
+  documentation for educational purpose is hereby granted without fee, provided
+  that the above copyright notice and this permission notice appear in all
+  copies of this software and that you do not sell the software.
 
-  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR
-  OTHERWISE.
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS,
+  IMPLIED OR OTHERWISE.
 
   Author: Lijiuan Luo (lluo3@uiuc.edu)
-  Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu (gengliu2@illinois.edu)
+  Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu
+  (gengliu2@illinois.edu)
 */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <math.h>
-#include <CL/cl.h>
-#include "parboil.h"
 #include "OpenCL_common.h"
 #include "config.h"
+#include "parboil.h"
+#include <CL/cl.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
-
-#define CHECK_ERROR(errorMessage)        \
-if(clStatus != CL_SUCCESS)               \
-{                                        \
-  printf("Error: %s!\n",errorMessage);   \
-  printf("Line: %d\n",__LINE__);         \
-  exit(1);                               \
-}
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
+  }
 
 FILE *fp;
-char* readFile(const char* fileName)
-{
-  FILE* fp;
-  fp = fopen(fileName,"r");
-  if(fp == NULL)
-  {
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
     printf("Error 1!\n");
     exit(1);
   }
 
-  fseek(fp,0,SEEK_END);
+  fseek(fp, 0, SEEK_END);
   long size = ftell(fp);
   rewind(fp);
 
-  char* buffer = (char*)malloc(sizeof(char)*size);
-  if(buffer  == NULL)
-  {
+  char *buffer = (char *)malloc(sizeof(char) * size);
+  if (buffer == NULL) {
     printf("Error 2!\n");
     fclose(fp);
     exit(1);
   }
 
-  size_t res = fread(buffer,1,size,fp);
-  if(res != size)
-  {
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
     printf("Error 3!\n");
     fclose(fp);
     exit(1);
@@ -77,97 +72,98 @@ char* readFile(const char* fileName)
 }
 const int h_top = 1;
 const int zero = 0;
-void runGPU(int argc, char** argv);
+void runGPU(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Main Program
 ////////////////////////////////////////////////////////////////////////////////
-int main( int argc, char** argv)
-{
+int main(int argc, char **argv) {
 
-  //the number of nodes in the graph
+  // the number of nodes in the graph
   int num_of_nodes = 0;
-  //the number of edges in the graph
+  // the number of edges in the graph
   int num_of_edges = 0;
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
 
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-  {
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) {
     fprintf(stderr, "Expecting one input filename\n");
     exit(-1);
   }
 
-  //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  //Read in Graph from a file
-  fp = fopen(params->inpFiles[0],"r");
-  if(!fp)
-  {
+  // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // Read in Graph from a file
+  fp = fopen(params->inpFiles[0], "r");
+  if (!fp) {
     printf("Error Reading graph file\n");
     return 0;
   }
   int source;
 
-  fscanf(fp,"%d",&num_of_nodes);
+  fscanf(fp, "%d", &num_of_nodes);
   // allocate host memory
-  struct Node* h_graph_nodes = (struct Node*) malloc(sizeof(struct Node)*num_of_nodes);
-  int *color = (int*) malloc(sizeof(int)*num_of_nodes);
+  struct Node *h_graph_nodes =
+      (struct Node *)malloc(sizeof(struct Node) * num_of_nodes);
+  int *color = (int *)malloc(sizeof(int) * num_of_nodes);
   int start, edgeno;
   // initalize the memory
   int i;
-  for( i = 0; i < num_of_nodes; i++)
-  {
-    fscanf(fp,"%d %d",&start,&edgeno);
+  for (i = 0; i < num_of_nodes; i++) {
+    fscanf(fp, "%d %d", &start, &edgeno);
     h_graph_nodes[i].x = start;
     h_graph_nodes[i].y = edgeno;
-    color[i]=WHITE;
+    color[i] = WHITE;
   }
-  //read the source node from the file
-  fscanf(fp,"%d",&source);
-  fscanf(fp,"%d",&num_of_edges);
-  int id,cost;
-  struct Edge* h_graph_edges = (struct Edge*) malloc(sizeof(struct Edge)*num_of_edges);
-  for(i=0; i < num_of_edges ; i++)
-  {
-    fscanf(fp,"%d",&id);
-    fscanf(fp,"%d",&cost);
+  // read the source node from the file
+  fscanf(fp, "%d", &source);
+  fscanf(fp, "%d", &num_of_edges);
+  int id, cost;
+  struct Edge *h_graph_edges =
+      (struct Edge *)malloc(sizeof(struct Edge) * num_of_edges);
+  for (i = 0; i < num_of_edges; i++) {
+    fscanf(fp, "%d", &id);
+    fscanf(fp, "%d", &cost);
     h_graph_edges[i].x = id;
     h_graph_edges[i].y = cost;
   }
-  if(fp)
+  if (fp)
     fclose(fp);
 
   pb_InitializeTimerSet(&timers);
   // allocate mem for the result on host side
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  int* h_cost = (int*) malloc( sizeof(int)*num_of_nodes);
-  for(i = 0; i < num_of_nodes; i++){
+  int *h_cost = (int *)malloc(sizeof(int) * num_of_nodes);
+  for (i = 0; i < num_of_nodes; i++) {
     h_cost[i] = INF;
   }
   h_cost[source] = 0;
 
-  //pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
   cl_int clStatus;
   cl_device_id clDevice;
   cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
 
   cl_uint numPlatforms;
-  clStatus  = clGetPlatformIDs(0, NULL, &numPlatforms);
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
 
   cl_platform_id clPlatform[numPlatforms];
   clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-  cl_context clContext = clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-  
+
   OCL_ERRCK_VAR(clStatus);
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   OCL_ERRCK_VAR(clStatus);
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
@@ -176,117 +172,157 @@ int main( int argc, char** argv)
   size_t program_length;
   const char *clSource_path = "src/opencl_cpu_baseline/kernel.cl";
   clSource = oclLoadProgSource(clSource_path, "", &program_length);
-  //printf("Program Source:\n%s\n", clSource);
-  cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&clSource, &program_length, &clStatus);
+  // printf("Program Source:\n%s\n", clSource);
+  cl_program clProgram = clCreateProgramWithSource(
+      clContext, 1, (const char **)&clSource, &program_length, &clStatus);
   OCL_ERRCK_VAR(clStatus);
 
   char clOptions[50];
-  sprintf(clOptions,"-I src/opencl_base");
-  OCL_ERRCK_RETVAL(clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL));
+  sprintf(clOptions, "-I src/opencl_base");
+  OCL_ERRCK_RETVAL(
+      clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL));
 
   // Uncomment to view build log from compiler for debugging
   /*
   char *build_log;
   size_t ret_val_size;
-  clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
-  build_log = (char *)malloc(ret_val_size+1);
-  clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-  // there's no information in the reference whether the string is 0 terminated or not
-  build_log[ret_val_size] = '\0';
-  printf("%s\n", build_log );
+  clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0,
+  NULL, &ret_val_size); build_log = (char *)malloc(ret_val_size+1); clStatus =
+  clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size,
+  build_log, NULL);
+  // there's no information in the reference whether the string is 0 terminated
+  or not build_log[ret_val_size] = '\0'; printf("%s\n", build_log );
   */
 
-  cl_kernel BFS_kernel = clCreateKernel(clProgram,"BFS_kernel",&clStatus);
+  cl_kernel BFS_kernel = clCreateKernel(clProgram, "BFS_kernel", &clStatus);
   OCL_ERRCK_VAR(clStatus);
 
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  //Copy the Node list to device memory
+  // Copy the Node list to device memory
   cl_mem d_graph_nodes;
-  d_graph_nodes = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_nodes*sizeof(struct Node),NULL,&clStatus);
+  d_graph_nodes =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                     num_of_nodes * sizeof(struct Node), NULL, &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_nodes,CL_TRUE,0,num_of_nodes*sizeof(struct Node),h_graph_nodes,0,NULL,NULL));
-  //Copy the Edge List to device Memory
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_nodes, CL_TRUE,
+                                        0, num_of_nodes * sizeof(struct Node),
+                                        h_graph_nodes, 0, NULL, NULL));
+  // Copy the Edge List to device Memory
   cl_mem d_graph_edges;
-  d_graph_edges = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_edges*sizeof(struct Edge),NULL,&clStatus);
+  d_graph_edges =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                     num_of_edges * sizeof(struct Edge), NULL, &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_edges,CL_TRUE,0,num_of_edges*sizeof(struct Edge),h_graph_edges,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_edges, CL_TRUE,
+                                        0, num_of_edges * sizeof(struct Edge),
+                                        h_graph_edges, 0, NULL, NULL));
 
   cl_mem d_color, d_cost, d_q1, d_q2, tail;
-  d_color = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  d_cost = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  d_q1 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  d_q2 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  tail = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus);
+  d_color = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                           num_of_nodes * sizeof(int), NULL, &clStatus);
+  d_cost = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                          num_of_nodes * sizeof(int), NULL, &clStatus);
+  d_q1 = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                        num_of_nodes * sizeof(int), NULL, &clStatus);
+  d_q2 = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                        num_of_nodes * sizeof(int), NULL, &clStatus);
+  tail = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL,
+                        &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_color, CL_TRUE, 0,
+                                        num_of_nodes * sizeof(int), color, 0,
+                                        NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0,
+                                        num_of_nodes * sizeof(int), h_cost, 0,
+                                        NULL, NULL));
 
   printf("Starting GPU kernel\n");
   pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
   int num_of_blocks;
   int num_of_threads_per_block;
 
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&h_top,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q1,CL_TRUE,0,sizeof(int),&source,0,NULL,NULL));
-  
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0,
+                                        sizeof(int), &h_top, 0, NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0,
+                                        sizeof(int), &zero, 0, NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_q1, CL_TRUE, 0,
+                                        sizeof(int), &source, 0, NULL, NULL));
+
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-  int num_t;//number of threads
-  int k=0;//BFS level index
+  int num_t; // number of threads
+  int k = 0; // BFS level index
+
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel, 2, sizeof(cl_mem), (void *)&d_graph_nodes));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel, 3, sizeof(cl_mem), (void *)&d_graph_edges));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel, 4, sizeof(cl_mem), (void *)&d_color));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel, 5, sizeof(cl_mem), (void *)&d_cost));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel, 6, sizeof(cl_mem), (void *)&tail));
 
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,2,sizeof(cl_mem),(void*)&d_graph_nodes));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,3,sizeof(cl_mem),(void*)&d_graph_edges));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,4,sizeof(cl_mem),(void*)&d_color));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,5,sizeof(cl_mem),(void*)&d_cost));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,6,sizeof(cl_mem),(void*)&tail));
+  do {
 
-  do
-  {
-    
     pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL));
-    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
+    OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, tail, CL_TRUE, 0,
+                                         sizeof(int), &num_t, 0, NULL, NULL));
+    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0,
+                                          sizeof(int), &zero, 0, NULL, NULL));
 
     pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    if(num_t == 0){//frontier is empty
+    if (num_t == 0) { // frontier is empty
       break;
     }
 
-    num_of_blocks = (int)ceil(num_t/(double)MAX_THREADS_PER_BLOCK);
-    num_of_threads_per_block = num_t > MAX_THREADS_PER_BLOCK ? MAX_THREADS_PER_BLOCK : num_t;
+    num_of_blocks = (int)ceil(num_t / (double)MAX_THREADS_PER_BLOCK);
+    num_of_threads_per_block =
+        num_t > MAX_THREADS_PER_BLOCK ? MAX_THREADS_PER_BLOCK : num_t;
 
-    size_t grid[1] = {num_of_blocks*num_of_threads_per_block};
+    size_t grid[1] = {num_of_blocks * num_of_threads_per_block};
     size_t block[1] = {num_of_threads_per_block};
 
-
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,7,sizeof(int),(void*)&num_t));
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,9,sizeof(int),(void*)&k));
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,10,sizeof(int),NULL));
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,11,LOCAL_MEM_SIZE*sizeof(int),NULL));
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,12,sizeof(int),NULL));
-    if(k%2 == 0){
+    OCL_ERRCK_RETVAL(
+        clSetKernelArg(BFS_kernel, 7, sizeof(int), (void *)&num_t));
+    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 9, sizeof(int), (void *)&k));
+    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 10, sizeof(int), NULL));
+    OCL_ERRCK_RETVAL(
+        clSetKernelArg(BFS_kernel, 11, LOCAL_MEM_SIZE * sizeof(int), NULL));
+    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel, 12, sizeof(int), NULL));
+    if (k % 2 == 0) {
       int gray = GRAY0;
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q1));
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q2));
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray));
-    }
-    else{
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 0, sizeof(cl_mem), (void *)&d_q1));
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 1, sizeof(cl_mem), (void *)&d_q2));
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 8, sizeof(int), (void *)&gray));
+    } else {
       int gray = GRAY1;
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,0,sizeof(cl_mem),(void*)&d_q2));
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,1,sizeof(cl_mem),(void*)&d_q1));
-      OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel,8,sizeof(int),(void*)&gray));
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 0, sizeof(cl_mem), (void *)&d_q2));
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 1, sizeof(cl_mem), (void *)&d_q1));
+      OCL_ERRCK_RETVAL(
+          clSetKernelArg(BFS_kernel, 8, sizeof(int), (void *)&gray));
     }
-    OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel,1,0,grid,block,0,0,0));
+    OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel, 1, 0,
+                                            grid, block, 0, 0, 0));
     OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
     k++;
-  } while(1);
+  } while (1);
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  //printf("GPU kernel done\n");
+  // printf("GPU kernel done\n");
 
   // copy result from device to host
-  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_cost, CL_TRUE, 0,
+                                       num_of_nodes * sizeof(int), h_cost, 0,
+                                       NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_color, CL_TRUE, 0,
+                                       num_of_nodes * sizeof(int), color, 0,
+                                       NULL, NULL));
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
@@ -299,14 +335,13 @@ int main( int argc, char** argv)
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
-  
-  //Store the result into a file
-  //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  FILE *fp = fopen(params->outFile,"w");
+  // Store the result into a file
+  // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  FILE *fp = fopen(params->outFile, "w");
   fprintf(fp, "%d\n", num_of_nodes);
   int j = 0;
-  for(j=0;j<num_of_nodes;j++)
-    fprintf(fp,"%d %d\n",j,h_cost[j]);
+  for (j = 0; j < num_of_nodes; j++)
+    fprintf(fp, "%d %d\n", j, h_cost[j]);
   fclose(fp);
   // cleanup memory
   free(h_graph_nodes);
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.cpp
index 57368eda9ada364e6edf6e1eccd35758fa349b62..38e60a1cbff3d9e4ce8d56204e9213943ea4fd55 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.cpp
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.cpp
@@ -4,41 +4,47 @@
 #include <string.h>
 
 // -1 for NO suitable device found, 0 if an appropriate device was found
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
-      
-        // Supported Device Requests (anything that returns cl_bool)
-        //   CL_DEVICE_IMAGE_SUPPORT
-        //   CL_DEVICE_HOST_UNIFIED_MEMORY
-        //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
-        //   CL_DEVICE_AVAILABLE
-        //   CL_DEVICE_COMPILER_AVAILABLE
-  
+int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device,
+                    cl_device_type *reqDeviceType, int numRequests, ...) {
+
+  // Supported Device Requests (anything that returns cl_bool)
+  //   CL_DEVICE_IMAGE_SUPPORT
+  //   CL_DEVICE_HOST_UNIFIED_MEMORY
+  //   CL_DEVICE_ERROR_CORRECTION_SUPPORT
+  //   CL_DEVICE_AVAILABLE
+  //   CL_DEVICE_COMPILER_AVAILABLE
+
   cl_uint numEntries = 16;
   cl_platform_id clPlatforms[numEntries];
   cl_uint numPlatforms;
-  
+
   cl_device_id clDevices[numEntries];
   cl_uint numDevices;
 
-  OCL_SIMPLE_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
+  OCL_SIMPLE_ERRCK_RETVAL(
+      clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms));
   fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
   bool needDevice = true;
-  
+
   for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
 
     cl_platform_id clPlatform = clPlatforms[ip];
-    
-    OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
-    fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip, numDevices);
-    
-    for (int id = 0; (id < numDevices) && needDevice ; ++id) {
+
+    OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL,
+                                           numEntries, clDevices, &numDevices));
+    fprintf(stderr, "  Number of Devices found for Platform %d: %d\n", ip,
+            numDevices);
+
+    for (int id = 0; (id < numDevices) && needDevice; ++id) {
       cl_device_id clDevice = clDevices[id];
       cl_device_type clDeviceType;
 
       bool canSatisfy = true;
-      
+
       if (reqDeviceType != NULL) {
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
+        OCL_SIMPLE_ERRCK_RETVAL(clGetDeviceInfo(clDevice, CL_DEVICE_TYPE,
+                                                sizeof(cl_device_type),
+                                                &clDeviceType, NULL));
         if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
           if (*reqDeviceType != clDeviceType) {
             canSatisfy = false;
@@ -48,32 +54,34 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty
 
       va_list paramList;
       va_start(paramList, numRequests);
-      for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
-      
-        cl_device_info devReq = va_arg( paramList, cl_device_info );  
+      for (int i = 0; (i < numRequests) && canSatisfy; ++i) {
+
+        cl_device_info devReq = va_arg(paramList, cl_device_info);
         cl_bool clInfoBool;
         size_t infoRetSize = sizeof(cl_bool);
-        
-        OCL_SIMPLE_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
+
+        OCL_SIMPLE_ERRCK_RETVAL(
+            clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
         if (clInfoBool != true) {
           canSatisfy = false;
         }
       }
-      
+
       va_end(paramList);
       if (canSatisfy) {
         *device = clDevice;
         *platform = clPlatform;
         needDevice = false;
         fprintf(stderr, "Chose Device Type: %s\n",
-          (clDeviceType == CL_DEVICE_TYPE_CPU) ? "CPU" : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other"
-          );
+                (clDeviceType == CL_DEVICE_TYPE_CPU)
+                    ? "CPU"
+                    : (clDeviceType == CL_DEVICE_TYPE_GPU) ? "GPU" : "other");
         if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
           *reqDeviceType = clDeviceType;
         }
       }
     } // End checking all devices for a platform
-  } // End checking all platforms
+  }   // End checking all platforms
 
   int retVal = -1;
   if (needDevice) {
@@ -81,214 +89,213 @@ int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_ty
   } else {
     retVal = 0;
   }
-  
+
   return retVal;
 }
 
-const char* oclErrorString(cl_int error)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
+const char *oclErrorString(cl_int error) {
+  // From NVIDIA SDK
+  static const char *errorString[] = {
+      "CL_SUCCESS",
+      "CL_DEVICE_NOT_FOUND",
+      "CL_DEVICE_NOT_AVAILABLE",
+      "CL_COMPILER_NOT_AVAILABLE",
+      "CL_MEM_OBJECT_ALLOCATION_FAILURE",
+      "CL_OUT_OF_RESOURCES",
+      "CL_OUT_OF_HOST_MEMORY",
+      "CL_PROFILING_INFO_NOT_AVAILABLE",
+      "CL_MEM_COPY_OVERLAP",
+      "CL_IMAGE_FORMAT_MISMATCH",
+      "CL_IMAGE_FORMAT_NOT_SUPPORTED",
+      "CL_BUILD_PROGRAM_FAILURE",
+      "CL_MAP_FAILURE",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "CL_INVALID_VALUE",
+      "CL_INVALID_DEVICE_TYPE",
+      "CL_INVALID_PLATFORM",
+      "CL_INVALID_DEVICE",
+      "CL_INVALID_CONTEXT",
+      "CL_INVALID_QUEUE_PROPERTIES",
+      "CL_INVALID_COMMAND_QUEUE",
+      "CL_INVALID_HOST_PTR",
+      "CL_INVALID_MEM_OBJECT",
+      "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
+      "CL_INVALID_IMAGE_SIZE",
+      "CL_INVALID_SAMPLER",
+      "CL_INVALID_BINARY",
+      "CL_INVALID_BUILD_OPTIONS",
+      "CL_INVALID_PROGRAM",
+      "CL_INVALID_PROGRAM_EXECUTABLE",
+      "CL_INVALID_KERNEL_NAME",
+      "CL_INVALID_KERNEL_DEFINITION",
+      "CL_INVALID_KERNEL",
+      "CL_INVALID_ARG_INDEX",
+      "CL_INVALID_ARG_VALUE",
+      "CL_INVALID_ARG_SIZE",
+      "CL_INVALID_KERNEL_ARGS",
+      "CL_INVALID_WORK_DIMENSION",
+      "CL_INVALID_WORK_GROUP_SIZE",
+      "CL_INVALID_WORK_ITEM_SIZE",
+      "CL_INVALID_GLOBAL_OFFSET",
+      "CL_INVALID_EVENT_WAIT_LIST",
+      "CL_INVALID_EVENT",
+      "CL_INVALID_OPERATION",
+      "CL_INVALID_GL_OBJECT",
+      "CL_INVALID_BUFFER_SIZE",
+      "CL_INVALID_MIP_LEVEL",
+      "CL_INVALID_GLOBAL_WORK_SIZE",
+  };
+
+  const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
+
+  const int index = -error;
+
+  return (index >= 0 && index < errorCount) ? errorString[index] : "";
 }
 
-const char* oclDebugErrString(cl_int error, cl_device_id device)
-{
-// From NVIDIA SDK
-	static const char* errorString[] = {
-		"CL_SUCCESS",
-		"CL_DEVICE_NOT_FOUND",
-		"CL_DEVICE_NOT_AVAILABLE",
-		"CL_COMPILER_NOT_AVAILABLE",
-		"CL_MEM_OBJECT_ALLOCATION_FAILURE",
-		"CL_OUT_OF_RESOURCES",
-		"CL_OUT_OF_HOST_MEMORY",
-		"CL_PROFILING_INFO_NOT_AVAILABLE",
-		"CL_MEM_COPY_OVERLAP",
-		"CL_IMAGE_FORMAT_MISMATCH",
-		"CL_IMAGE_FORMAT_NOT_SUPPORTED",
-		"CL_BUILD_PROGRAM_FAILURE",
-		"CL_MAP_FAILURE",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"",
-		"CL_INVALID_VALUE",
-		"CL_INVALID_DEVICE_TYPE",
-		"CL_INVALID_PLATFORM",
-		"CL_INVALID_DEVICE",
-		"CL_INVALID_CONTEXT",
-		"CL_INVALID_QUEUE_PROPERTIES",
-		"CL_INVALID_COMMAND_QUEUE",
-		"CL_INVALID_HOST_PTR",
-		"CL_INVALID_MEM_OBJECT",
-		"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-		"CL_INVALID_IMAGE_SIZE",
-		"CL_INVALID_SAMPLER",
-		"CL_INVALID_BINARY",
-		"CL_INVALID_BUILD_OPTIONS",
-		"CL_INVALID_PROGRAM",
-		"CL_INVALID_PROGRAM_EXECUTABLE",
-		"CL_INVALID_KERNEL_NAME",
-		"CL_INVALID_KERNEL_DEFINITION",
-		"CL_INVALID_KERNEL",
-		"CL_INVALID_ARG_INDEX",
-		"CL_INVALID_ARG_VALUE",
-		"CL_INVALID_ARG_SIZE",
-		"CL_INVALID_KERNEL_ARGS",
-		"CL_INVALID_WORK_DIMENSION",
-		"CL_INVALID_WORK_GROUP_SIZE",
-		"CL_INVALID_WORK_ITEM_SIZE",
-		"CL_INVALID_GLOBAL_OFFSET",
-		"CL_INVALID_EVENT_WAIT_LIST",
-		"CL_INVALID_EVENT",
-		"CL_INVALID_OPERATION",
-		"CL_INVALID_GL_OBJECT",
-		"CL_INVALID_BUFFER_SIZE",
-		"CL_INVALID_MIP_LEVEL",
-		"CL_INVALID_GLOBAL_WORK_SIZE",
-	};
-
-	const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-	const int index = -error;
-	
-	if (index == 4) {
-	  cl_uint maxMemAlloc = 0;
-	  OCL_SIMPLE_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
-	  fprintf(stderr, "  Device Maximum block allocation size: %lu\n", maxMemAlloc);
-	}
-
-	return (index >= 0 && index < errorCount) ? errorString[index] : "";
+const char *oclDebugErrString(cl_int error, cl_device_id device) {
+  // From NVIDIA SDK
+  static const char *errorString[] = {
+      "CL_SUCCESS",
+      "CL_DEVICE_NOT_FOUND",
+      "CL_DEVICE_NOT_AVAILABLE",
+      "CL_COMPILER_NOT_AVAILABLE",
+      "CL_MEM_OBJECT_ALLOCATION_FAILURE",
+      "CL_OUT_OF_RESOURCES",
+      "CL_OUT_OF_HOST_MEMORY",
+      "CL_PROFILING_INFO_NOT_AVAILABLE",
+      "CL_MEM_COPY_OVERLAP",
+      "CL_IMAGE_FORMAT_MISMATCH",
+      "CL_IMAGE_FORMAT_NOT_SUPPORTED",
+      "CL_BUILD_PROGRAM_FAILURE",
+      "CL_MAP_FAILURE",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "",
+      "CL_INVALID_VALUE",
+      "CL_INVALID_DEVICE_TYPE",
+      "CL_INVALID_PLATFORM",
+      "CL_INVALID_DEVICE",
+      "CL_INVALID_CONTEXT",
+      "CL_INVALID_QUEUE_PROPERTIES",
+      "CL_INVALID_COMMAND_QUEUE",
+      "CL_INVALID_HOST_PTR",
+      "CL_INVALID_MEM_OBJECT",
+      "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
+      "CL_INVALID_IMAGE_SIZE",
+      "CL_INVALID_SAMPLER",
+      "CL_INVALID_BINARY",
+      "CL_INVALID_BUILD_OPTIONS",
+      "CL_INVALID_PROGRAM",
+      "CL_INVALID_PROGRAM_EXECUTABLE",
+      "CL_INVALID_KERNEL_NAME",
+      "CL_INVALID_KERNEL_DEFINITION",
+      "CL_INVALID_KERNEL",
+      "CL_INVALID_ARG_INDEX",
+      "CL_INVALID_ARG_VALUE",
+      "CL_INVALID_ARG_SIZE",
+      "CL_INVALID_KERNEL_ARGS",
+      "CL_INVALID_WORK_DIMENSION",
+      "CL_INVALID_WORK_GROUP_SIZE",
+      "CL_INVALID_WORK_ITEM_SIZE",
+      "CL_INVALID_GLOBAL_OFFSET",
+      "CL_INVALID_EVENT_WAIT_LIST",
+      "CL_INVALID_EVENT",
+      "CL_INVALID_OPERATION",
+      "CL_INVALID_GL_OBJECT",
+      "CL_INVALID_BUFFER_SIZE",
+      "CL_INVALID_MIP_LEVEL",
+      "CL_INVALID_GLOBAL_WORK_SIZE",
+  };
+
+  const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
+
+  const int index = -error;
+
+  if (index == 4) {
+    cl_uint maxMemAlloc = 0;
+    OCL_SIMPLE_ERRCK_RETVAL(
+        clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong),
+                        &maxMemAlloc, NULL));
+    fprintf(stderr, "  Device Maximum block allocation size: %lu\n",
+            maxMemAlloc);
+  }
+
+  return (index >= 0 && index < errorCount) ? errorString[index] : "";
 }
 
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
+char *oclLoadProgSource(const char *cFilename, const char *cPreamble,
+                        size_t *szFinalLength) {
+  // locals
+  FILE *pFileStream = NULL;
+  size_t szSourceLength;
+
+// open the OpenCL source code file
+#ifdef _WIN32 // Windows version
+  if (fopen_s(&pFileStream, cFilename, "rb") != 0) {
+    return NULL;
+  }
+#else // Linux version
+  pFileStream = fopen(cFilename, "rb");
+  if (pFileStream == 0) {
+    return NULL;
+  }
+#endif
+
+  size_t szPreambleLength = strlen(cPreamble);
+
+  // get the length of the source code
+  fseek(pFileStream, 0, SEEK_END);
+  szSourceLength = ftell(pFileStream);
+  fseek(pFileStream, 0, SEEK_SET);
+
+  // allocate a buffer for the source code string and read it in
+  char *cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
+  memcpy(cSourceString, cPreamble, szPreambleLength);
+  if (fread((cSourceString) + szPreambleLength, szSourceLength, 1,
+            pFileStream) != 1) {
     fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
+    free(cSourceString);
+    return 0;
+  }
+
+  // close the file and return the total length of the combined (preamble +
+  // source) string
+  fclose(pFileStream);
+  if (szFinalLength != 0) {
+    *szFinalLength = szSourceLength + szPreambleLength;
+  }
+  cSourceString[szSourceLength + szPreambleLength] = '\0';
 
-    return cSourceString;
+  return cSourceString;
 }
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.h
index 976c692055501532d65a1ac25e74630732fd2a86..27b084487c6289196337ca064b94f1353f8bbbad 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.h
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/OpenCL_common.h
@@ -2,26 +2,40 @@
 #ifndef __OPENCL_COMMON_H_
 #define __OPENCL_COMMON_H_
 
-#include <stdio.h>
+#include <CL/cl.h>
 #include <stdarg.h>
+#include <stdio.h>
 #include <string.h>
-#include <CL/cl.h>
 
-int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
-const char* oclErrorString(cl_int error);
-const char* oclDebugErrString(cl_int error, cl_device_id device);
-
-#define OCL_ERRCK_VAR(var) \
-  { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }  
-  
-#define OCL_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclDebugErrString(clerr, clDevice)); }
-    
-#define OCL_SIMPLE_ERRCK_RETVAL(s) \
-  { cl_int clerr = (s);\
-    if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
-
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
+int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device,
+                    cl_device_type *reqDeviceType, int numRequests, ...);
+const char *oclErrorString(cl_int error);
+const char *oclDebugErrString(cl_int error, cl_device_id device);
+
+#define OCL_ERRCK_VAR(var)                                                     \
+  {                                                                            \
+    if (var != CL_SUCCESS)                                                     \
+      fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__,       \
+              oclErrorString(var));                                            \
+  }
+
+#define OCL_ERRCK_RETVAL(s)                                                    \
+  {                                                                            \
+    cl_int clerr = (s);                                                        \
+    if (clerr != CL_SUCCESS)                                                   \
+      fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__,       \
+              oclDebugErrString(clerr, clDevice));                             \
+  }
+
+#define OCL_SIMPLE_ERRCK_RETVAL(s)                                             \
+  {                                                                            \
+    cl_int clerr = (s);                                                        \
+    if (clerr != CL_SUCCESS)                                                   \
+      fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__,       \
+              oclErrorString(clerr));                                          \
+  }
+
+char *oclLoadProgSource(const char *cFilename, const char *cPreamble,
+                        size_t *szFinalLength);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/config.h b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/config.h
index 36640fd99d5dc86b0509ab30724e419dbc4720c5..9cfe7257ba16f72cbec7e00faa1a078778e0ab50 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/config.h
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/config.h
@@ -1,10 +1,15 @@
-#define NUM_BIN 8 //the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU
-#define EXP 3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future architecture
-	//using EXP and shifting can speed up division operation 
-#define MOD_OP 7 // This variable is also related with NUM_BIN; may change in the future architecture;
-	//using MOD_OP and "bitwise and" can speed up mod operation
-#define INF 2147483647//2^31-1
-#define UP_LIMIT 16677216//2^24
+#define NUM_BIN                                                                \
+  8 // the number of duplicated frontiers used in BFS_kernel_multi_blk_inGPU
+#define EXP                                                                    \
+  3 // EXP = log(NUM_BIN), assuming NUM_BIN is still power of 2 in the future
+    // architecture
+    // using EXP and shifting can speed up division operation
+#define MOD_OP                                                                 \
+  7 // This variable is also related with NUM_BIN; may change in the future
+    // architecture;
+    // using MOD_OP and "bitwise and" can speed up mod operation
+#define INF 2147483647    // 2^31-1
+#define UP_LIMIT 16677216 // 2^24
 #define WHITE 16677217
 #define GRAY 16677218
 #define GRAY0 16677219
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp
index 0a1b13ee1c677de5a129dfcf0adb78ce293718e6..3f9bc775574f597bdcf69c6999553c3c37bd352d 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp
+++ b/hpvm/test/parboil/benchmarks/bfs/src/opencl_nvidia/main.cpp
@@ -9,36 +9,36 @@
   Implementing Breadth first search on CUDA using algorithm given in DAC'10
   paper "An Effective GPU Implementation of Breadth-First Search"
 
-  Copyright (c) 2010 University of Illinois at Urbana-Champaign. 
+  Copyright (c) 2010 University of Illinois at Urbana-Champaign.
   All rights reserved.
 
-  Permission to use, copy, modify and distribute this software and its documentation for 
-  educational purpose is hereby granted without fee, provided that the above copyright 
-  notice and this permission notice appear in all copies of this software and that you do 
-  not sell the software.
+  Permission to use, copy, modify and distribute this software and its
+  documentation for educational purpose is hereby granted without fee, provided
+  that the above copyright notice and this permission notice appear in all
+  copies of this software and that you do not sell the software.
 
-  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR 
-  OTHERWISE.
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS,
+  IMPLIED OR OTHERWISE.
 
   Author: Lijiuan Luo (lluo3@uiuc.edu)
-  Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu (gengliu2@illinois.edu)
+  Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu
+  (gengliu2@illinois.edu)
 */
+#include "OpenCL_common.h"
+#include "config.h"
 #include <CL/cl.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
 #include <parboil.h>
-#include "OpenCL_common.h"
-#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
-#define CHECK_ERROR(errorMessage)        \
-if(clStatus != CL_SUCCESS)               \
-{                                        \
-  printf("Error: %s!\n",errorMessage);   \
-  printf("Line: %d\n",__LINE__);         \
-  exit(1);                               \
-}
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
+  }
 
 FILE *fp;
 struct Node {
@@ -49,113 +49,110 @@ struct Edge {
   int x;
   int y;
 };
-char* readFile(const char* fileName) 
-{
-  FILE* fp;
-  fp = fopen(fileName,"r");
-  if(fp == NULL)
-  {
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
     printf("Error 1!\n");
     exit(1);
   }
 
-  fseek(fp,0,SEEK_END);
+  fseek(fp, 0, SEEK_END);
   long size = ftell(fp);
   rewind(fp);
 
-  char* buffer = (char*)malloc(sizeof(char)*size);
-  if(buffer  == NULL)
-  {
+  char *buffer = (char *)malloc(sizeof(char) * size);
+  if (buffer == NULL) {
     printf("Error 2!\n");
     fclose(fp);
     exit(1);
   }
 
-  size_t res = fread(buffer,1,size,fp);
-  if(res != size)
-  {
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
     printf("Error 3!\n");
     fclose(fp);
     exit(1);
   }
 
-  fclose(fp);                                                                                    
+  fclose(fp);
   return buffer;
 }
 //#include "kernel.cl"
-//Somehow "cudaMemset" does not work. So I use cudaMemcpy of constant variables for initialization
+// Somehow "cudaMemset" does not work. So I use cudaMemcpy of constant variables
+// for initialization
 const int h_top = 1;
 const int zero = 0;
 
-int BFS_GPU(cl_mem d_graph_nodes,cl_mem d_graph_edges, cl_mem d_color, cl_mem d_cost, cl_mem d_q1, cl_mem d_q2, cl_mem tail, int * source, cl_int clStatus, cl_command_queue clCommandQueue, cl_kernel BFS_kernel_S, cl_kernel BFS_kernel_M, cl_kernel BFS_kernel_L, cl_device_id clDevice, cl_context clContext){
-}
-void runGPU(int argc, char** argv);
+int BFS_GPU(cl_mem d_graph_nodes, cl_mem d_graph_edges, cl_mem d_color,
+            cl_mem d_cost, cl_mem d_q1, cl_mem d_q2, cl_mem tail, int *source,
+            cl_int clStatus, cl_command_queue clCommandQueue,
+            cl_kernel BFS_kernel_S, cl_kernel BFS_kernel_M,
+            cl_kernel BFS_kernel_L, cl_device_id clDevice,
+            cl_context clContext) {}
+void runGPU(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Main Program
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char** argv) 
-{
-  //the number of nodes in the graph
-  int num_of_nodes = 0; 
-  //the number of edges in the graph
+int main(int argc, char **argv) {
+  // the number of nodes in the graph
+  int num_of_nodes = 0;
+  // the number of edges in the graph
   int num_of_edges = 0;
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
 
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-  {
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) {
     fprintf(stderr, "Expecting one input filename\n");
     exit(-1);
   }
 
-  //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  //printf("Reading File\n");
-  //Read in Graph from a file
-  fp = fopen(params->inpFiles[0],"r");
-  if(!fp)
-  {
+  // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // printf("Reading File\n");
+  // Read in Graph from a file
+  fp = fopen(params->inpFiles[0], "r");
+  if (!fp) {
     printf("Error Reading graph file\n");
     return 0;
   }
   int source;
 
-  fscanf(fp,"%d",&num_of_nodes);
+  fscanf(fp, "%d", &num_of_nodes);
   // allocate host memory
-  struct Node* h_graph_nodes = (struct Node*) malloc(sizeof(struct Node)*num_of_nodes);
-  int *color = (int*) malloc(sizeof(int)*num_of_nodes);
-  int start, edgeno;   
+  struct Node *h_graph_nodes =
+      (struct Node *)malloc(sizeof(struct Node) * num_of_nodes);
+  int *color = (int *)malloc(sizeof(int) * num_of_nodes);
+  int start, edgeno;
   // initalize the memory
   int i;
-  for( i = 0; i < num_of_nodes; i++) 
-  {
-    fscanf(fp,"%d %d",&start,&edgeno);
+  for (i = 0; i < num_of_nodes; i++) {
+    fscanf(fp, "%d %d", &start, &edgeno);
     h_graph_nodes[i].x = start;
     h_graph_nodes[i].y = edgeno;
-    color[i]=WHITE;
+    color[i] = WHITE;
   }
-  //read the source node from the file
-  fscanf(fp,"%d",&source);
-  fscanf(fp,"%d",&num_of_edges);
-  int id,cost;
-  struct Edge* h_graph_edges = (struct Edge*) malloc(sizeof(struct Edge)*num_of_edges);
-  for(i=0; i < num_of_edges ; i++)
-  {
-    fscanf(fp,"%d",&id);
-    fscanf(fp,"%d",&cost);
+  // read the source node from the file
+  fscanf(fp, "%d", &source);
+  fscanf(fp, "%d", &num_of_edges);
+  int id, cost;
+  struct Edge *h_graph_edges =
+      (struct Edge *)malloc(sizeof(struct Edge) * num_of_edges);
+  for (i = 0; i < num_of_edges; i++) {
+    fscanf(fp, "%d", &id);
+    fscanf(fp, "%d", &cost);
     h_graph_edges[i].x = id;
     h_graph_edges[i].y = cost;
   }
-  if(fp)
-    fclose(fp);    
+  if (fp)
+    fclose(fp);
 
   pb_InitializeTimerSet(&timers);
 
-
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // allocate mem for the result on host side
-  int* h_cost = (int*) malloc( sizeof(int)*num_of_nodes);
-  for(i = 0; i < num_of_nodes; i++){
+  int *h_cost = (int *)malloc(sizeof(int) * num_of_nodes);
+  for (i = 0; i < num_of_nodes; i++) {
     h_cost[i] = INF;
   }
   h_cost[source] = 0;
@@ -165,319 +162,451 @@ int main(int argc, char** argv)
   cl_int clStatus;
   cl_device_id clDevice;
   cl_platform_id clPlatform;
-  OCL_ERRCK_RETVAL(clGetPlatformIDs(1,&clPlatform,NULL));
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  OCL_ERRCK_RETVAL(clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL));
-  size_t MAX_THREADS_PER_BLOCK = 0; 
-  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(MAX_THREADS_PER_BLOCK), &MAX_THREADS_PER_BLOCK, NULL);
-  if(MAX_THREADS_PER_BLOCK > 512)
+  OCL_ERRCK_RETVAL(clGetPlatformIDs(1, &clPlatform, NULL));
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  OCL_ERRCK_RETVAL(
+      clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL));
+  size_t MAX_THREADS_PER_BLOCK = 0;
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE,
+                             sizeof(MAX_THREADS_PER_BLOCK),
+                             &MAX_THREADS_PER_BLOCK, NULL);
+  if (MAX_THREADS_PER_BLOCK > 512)
     MAX_THREADS_PER_BLOCK = 512;
   OCL_ERRCK_VAR(clStatus);
 
   int NUM_SM = 0;
-  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(NUM_SM), &NUM_SM, NULL);
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS,
+                             sizeof(NUM_SM), &NUM_SM, NULL);
   OCL_ERRCK_VAR(clStatus);
 
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   OCL_ERRCK_VAR(clStatus);
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource_path = "src/opencl_nvidia/kernel.cl";
+  const char *clSource_path = "src/opencl_nvidia/kernel.cl";
   size_t program_length;
-  char* clSource = oclLoadProgSource(clSource_path, "", &program_length);
-  //printf("Program Source:\n%s\n", clSource);
+  char *clSource = oclLoadProgSource(clSource_path, "", &program_length);
+  // printf("Program Source:\n%s\n", clSource);
   printf("Program building ...\n");
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,(const char**)&clSource, &program_length,&clStatus);
+  cl_program clProgram = clCreateProgramWithSource(
+      clContext, 1, (const char **)&clSource, &program_length, &clStatus);
   printf("Program built\n");
   OCL_ERRCK_VAR(clStatus);
 
   char clOptions[100];
-  //printf("NUM_SM = %d, MAX_THREADS_PER_BLOCK = %d\n", NUM_SM, MAX_THREADS_PER_BLOCK);
-  sprintf(clOptions,"-I src/opencl_nvidia -DMAX_THREADS_PER_BLOCK=%d -DNUM_SM=%d", MAX_THREADS_PER_BLOCK, NUM_SM);
-  OCL_ERRCK_RETVAL(clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL));
+  // printf("NUM_SM = %d, MAX_THREADS_PER_BLOCK = %d\n", NUM_SM,
+  // MAX_THREADS_PER_BLOCK);
+  sprintf(clOptions,
+          "-I src/opencl_nvidia -DMAX_THREADS_PER_BLOCK=%d -DNUM_SM=%d",
+          MAX_THREADS_PER_BLOCK, NUM_SM);
+  OCL_ERRCK_RETVAL(
+      clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL));
 
   // Uncomment to view build log from compiler for debugging
-  
+
   char *build_log;
   size_t ret_val_size;
-  clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);  
-  build_log = (char *)malloc(ret_val_size+1);
-  clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
-  // there's no information in the reference whether the string is 0 terminated or not
+  clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0,
+                                   NULL, &ret_val_size);
+  build_log = (char *)malloc(ret_val_size + 1);
+  clStatus = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
+                                   ret_val_size, build_log, NULL);
+  // there's no information in the reference whether the string is 0 terminated
+  // or not
   build_log[ret_val_size] = '\0';
-  printf("%s\n", build_log );
-  
-
-  //Small kernel: only 1 block
-  cl_kernel BFS_kernel_S = clCreateKernel(clProgram,"BFS_in_GPU_kernel",&clStatus);
-  //Medium kernel: 1 block per SM
-  cl_kernel BFS_kernel_M = clCreateKernel(clProgram,"BFS_kernel_multi_blk_inGPU",&clStatus);
-  //Large kernel: No restriction
-  cl_kernel BFS_kernel_L = clCreateKernel(clProgram,"BFS_kernel",&clStatus);
+  printf("%s\n", build_log);
+
+  // Small kernel: only 1 block
+  cl_kernel BFS_kernel_S =
+      clCreateKernel(clProgram, "BFS_in_GPU_kernel", &clStatus);
+  // Medium kernel: 1 block per SM
+  cl_kernel BFS_kernel_M =
+      clCreateKernel(clProgram, "BFS_kernel_multi_blk_inGPU", &clStatus);
+  // Large kernel: No restriction
+  cl_kernel BFS_kernel_L = clCreateKernel(clProgram, "BFS_kernel", &clStatus);
   OCL_ERRCK_VAR(clStatus);
 
-  //Copy the Node list to device memory
+  // Copy the Node list to device memory
   cl_mem d_graph_nodes;
-  d_graph_nodes = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_nodes*sizeof(struct Node),NULL,&clStatus);
+  d_graph_nodes =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                     num_of_nodes * sizeof(struct Node), NULL, &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_nodes,CL_TRUE,0,num_of_nodes*sizeof(struct Node),h_graph_nodes,0,NULL,NULL));
-  //Copy the Edge List to device Memory
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_nodes, CL_TRUE,
+                                        0, num_of_nodes * sizeof(struct Node),
+                                        h_graph_nodes, 0, NULL, NULL));
+  // Copy the Edge List to device Memory
   cl_mem d_graph_edges;
-  d_graph_edges = clCreateBuffer(clContext,CL_MEM_READ_ONLY,num_of_edges*sizeof(struct Edge),NULL,&clStatus);
+  d_graph_edges =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                     num_of_edges * sizeof(struct Edge), NULL, &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_graph_edges,CL_TRUE,0,num_of_edges*sizeof(struct Edge),h_graph_edges,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_graph_edges, CL_TRUE,
+                                        0, num_of_edges * sizeof(struct Edge),
+                                        h_graph_edges, 0, NULL, NULL));
 
   cl_mem d_color, d_cost, d_q1, d_q2, tail;
-  d_color = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  d_cost = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  d_q1 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  d_q2 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,num_of_nodes*sizeof(int),NULL,&clStatus);
-  tail = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus);
+  d_color = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                           num_of_nodes * sizeof(int), NULL, &clStatus);
+  d_cost = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                          num_of_nodes * sizeof(int), NULL, &clStatus);
+  d_q1 = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                        num_of_nodes * sizeof(int), NULL, &clStatus);
+  d_q2 = clCreateBuffer(clContext, CL_MEM_READ_WRITE,
+                        num_of_nodes * sizeof(int), NULL, &clStatus);
+  tail = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL,
+                        &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_color, CL_TRUE, 0,
+                                        num_of_nodes * sizeof(int), color, 0,
+                                        NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0,
+                                        num_of_nodes * sizeof(int), h_cost, 0,
+                                        NULL, NULL));
 
   printf("Starting GPU kernel\n");
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  int num_of_blocks; 
+  int num_of_blocks;
   int num_of_threads_per_block;
 
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&h_top,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_cost,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,d_q1,CL_TRUE,0,sizeof(int),&source,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0,
+                                        sizeof(int), &h_top, 0, NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_cost, CL_TRUE, 0,
+                                        sizeof(int), &zero, 0, NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, d_q1, CL_TRUE, 0,
+                                        sizeof(int), &source, 0, NULL, NULL));
 
-  int num_t;//number of threads
-  int k=0;//BFS level index
+  int num_t; // number of threads
+  int k = 0; // BFS level index
 
   cl_mem switch_kd, num_td, global_kt_d;
-  switch_kd = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus);
-  num_td = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus);
-  global_kt_d = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus);
+  switch_kd = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL,
+                             &clStatus);
+  num_td = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL,
+                          &clStatus);
+  global_kt_d = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL,
+                               &clStatus);
   OCL_ERRCK_VAR(clStatus);
   int switch_k;
   int global_kt = 0;
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,global_kt_d,CL_TRUE,0,sizeof(int),&global_kt,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, global_kt_d, CL_TRUE, 0,
+                                        sizeof(int), &global_kt, 0, NULL,
+                                        NULL));
 
   cl_mem count;
   cl_mem num_of_nodes_vol;
   cl_mem stay_vol;
-  count = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus);
-  num_of_nodes_vol = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus);
-  stay_vol = clCreateBuffer(clContext,CL_MEM_READ_WRITE,sizeof(int),NULL,&clStatus);
+  count = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL,
+                         &clStatus);
+  num_of_nodes_vol = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int),
+                                    NULL, &clStatus);
+  stay_vol = clCreateBuffer(clContext, CL_MEM_READ_WRITE, sizeof(int), NULL,
+                            &clStatus);
   OCL_ERRCK_VAR(clStatus);
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,count,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,num_of_nodes_vol,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,stay_vol,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, count, CL_TRUE, 0,
+                                        sizeof(int), &zero, 0, NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, num_of_nodes_vol,
+                                        CL_TRUE, 0, sizeof(int), &zero, 0, NULL,
+                                        NULL));
+  OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, stay_vol, CL_TRUE, 0,
+                                        sizeof(int), &zero, 0, NULL, NULL));
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  //BFS_kernel_S arguments setup
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,2,sizeof(cl_mem),(void*)&d_graph_nodes));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,3,sizeof(cl_mem),(void*)&d_graph_edges));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,4,sizeof(cl_mem),(void*)&d_color));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,5,sizeof(cl_mem),(void*)&d_cost));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,6,sizeof(cl_mem),(void*)&tail));
-
-  //BFS_kernel_M arguments setup
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,2,sizeof(cl_mem),(void*)&d_graph_nodes));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,3,sizeof(cl_mem),(void*)&d_graph_edges));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,4,sizeof(cl_mem),(void*)&d_color));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,5,sizeof(cl_mem),(void*)&d_cost));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,6,sizeof(cl_mem),(void*)&num_td));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,7,sizeof(cl_mem),(void*)&tail));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,10,sizeof(cl_mem),(void*)&switch_kd));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,11,sizeof(cl_mem),(void*)&global_kt_d));
-  //volatile mem
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,19,sizeof(cl_mem),(void*)&count));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,20,sizeof(cl_mem),(void*)&num_of_nodes_vol));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,21,sizeof(cl_mem),(void*)&stay_vol));
-
-  //BFS_kernel_L arguments setup
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,2,sizeof(cl_mem),(void*)&d_graph_nodes));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,3,sizeof(cl_mem),(void*)&d_graph_edges));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,4,sizeof(cl_mem),(void*)&d_color));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,5,sizeof(cl_mem),(void*)&d_cost));
-  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,6,sizeof(cl_mem),(void*)&tail));
+  // BFS_kernel_S arguments setup
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_S, 2, sizeof(cl_mem), (void *)&d_graph_nodes));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_S, 3, sizeof(cl_mem), (void *)&d_graph_edges));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_S, 4, sizeof(cl_mem), (void *)&d_color));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_S, 5, sizeof(cl_mem), (void *)&d_cost));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_S, 6, sizeof(cl_mem), (void *)&tail));
+
+  // BFS_kernel_M arguments setup
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_M, 2, sizeof(cl_mem), (void *)&d_graph_nodes));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_M, 3, sizeof(cl_mem), (void *)&d_graph_edges));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_M, 4, sizeof(cl_mem), (void *)&d_color));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_M, 5, sizeof(cl_mem), (void *)&d_cost));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_M, 6, sizeof(cl_mem), (void *)&num_td));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_M, 7, sizeof(cl_mem), (void *)&tail));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_M, 10, sizeof(cl_mem), (void *)&switch_kd));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_M, 11, sizeof(cl_mem), (void *)&global_kt_d));
+  // volatile mem
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_M, 19, sizeof(cl_mem), (void *)&count));
+  OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 20, sizeof(cl_mem),
+                                  (void *)&num_of_nodes_vol));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_M, 21, sizeof(cl_mem), (void *)&stay_vol));
+
+  // BFS_kernel_L arguments setup
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_L, 2, sizeof(cl_mem), (void *)&d_graph_nodes));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_L, 3, sizeof(cl_mem), (void *)&d_graph_edges));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_L, 4, sizeof(cl_mem), (void *)&d_color));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_L, 5, sizeof(cl_mem), (void *)&d_cost));
+  OCL_ERRCK_RETVAL(
+      clSetKernelArg(BFS_kernel_L, 6, sizeof(cl_mem), (void *)&tail));
 
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-  do
-  {
-    OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL));
-    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,tail,CL_TRUE,0,sizeof(int),&zero,0,NULL,NULL));
+  do {
+    OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, tail, CL_TRUE, 0,
+                                         sizeof(int), &num_t, 0, NULL, NULL));
+    OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, tail, CL_TRUE, 0,
+                                          sizeof(int), &zero, 0, NULL, NULL));
 
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    if(num_t == 0){//frontier is empty
+    if (num_t == 0) { // frontier is empty
       break;
     }
 
     num_of_blocks = 1;
     num_of_threads_per_block = num_t;
-    if(num_of_threads_per_block <NUM_BIN)
+    if (num_of_threads_per_block < NUM_BIN)
       num_of_threads_per_block = NUM_BIN;
-    if(num_t>MAX_THREADS_PER_BLOCK)
-    {
-      num_of_blocks = (int)ceil(num_t/(double)MAX_THREADS_PER_BLOCK); 
+    if (num_t > MAX_THREADS_PER_BLOCK) {
+      num_of_blocks = (int)ceil(num_t / (double)MAX_THREADS_PER_BLOCK);
       num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
     }
-    if(num_of_blocks == 1)//will call "BFS_in_GPU_kernel" 
-      num_of_threads_per_block = MAX_THREADS_PER_BLOCK; 
-    if(num_of_blocks >1 && num_of_blocks <= NUM_SM)// will call "BFS_kernel_multi_blk_inGPU"
+    if (num_of_blocks == 1) // will call "BFS_in_GPU_kernel"
+      num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
+    if (num_of_blocks > 1 &&
+        num_of_blocks <= NUM_SM) // will call "BFS_kernel_multi_blk_inGPU"
       num_of_blocks = NUM_SM;
 
-    //assume "num_of_blocks" can not be very large
-    size_t grid[1] = {num_of_blocks*num_of_threads_per_block};
+    // assume "num_of_blocks" can not be very large
+    size_t grid[1] = {num_of_blocks * num_of_threads_per_block};
     size_t block[1] = {num_of_threads_per_block};
 
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,7,sizeof(int),(void*)&num_t));
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,9,sizeof(int),(void*)&k));
+    OCL_ERRCK_RETVAL(
+        clSetKernelArg(BFS_kernel_S, 7, sizeof(int), (void *)&num_t));
+    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S, 9, sizeof(int), (void *)&k));
 
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,9,sizeof(int),(void*)&k));
+    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 9, sizeof(int), (void *)&k));
 
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,7,sizeof(int),(void*)&num_t));
-    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,9,sizeof(int),(void*)&k));
+    OCL_ERRCK_RETVAL(
+        clSetKernelArg(BFS_kernel_L, 7, sizeof(int), (void *)&num_t));
+    OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 9, sizeof(int), (void *)&k));
 
-    if(k%2 == 0){
+    if (k % 2 == 0) {
       int gray = GRAY0;
-      if(num_of_blocks == 1) {
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,0,sizeof(cl_mem),(void*)&d_q1));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,1,sizeof(cl_mem),(void*)&d_q2));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,8,sizeof(int),(void*)&gray));
-        //shared_mem
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,10,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,11,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,12,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,13,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,14,MAX_THREADS_PER_BLOCK*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,15,sizeof(int),NULL));
+      if (num_of_blocks == 1) {
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_S, 0, sizeof(cl_mem), (void *)&d_q1));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_S, 1, sizeof(cl_mem), (void *)&d_q2));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_S, 8, sizeof(int), (void *)&gray));
+        // shared_mem
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_S, 10, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(
+            BFS_kernel_S, 11, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_S, 12, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_S, 13, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(
+            BFS_kernel_S, 14, MAX_THREADS_PER_BLOCK * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S, 15, sizeof(int), NULL));
         pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_S,1,0,grid,block,0,0,0));
+        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_S, 1,
+                                                0, grid, block, 0, 0, 0));
         OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
         pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
       } else if (num_of_blocks <= NUM_SM) {
         pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-        OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,num_td,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL));
+        OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, num_td, CL_TRUE,
+                                              0, sizeof(int), &num_t, 0, NULL,
+                                              NULL));
         pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,0,sizeof(cl_mem),(void*)&d_q1));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,1,sizeof(cl_mem),(void*)&d_q2));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,8,sizeof(int),(void*)&gray));
-        //shared_mem
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,12,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,13,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,14,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,15,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,16,sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,17,sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,18,sizeof(int),NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 0, sizeof(cl_mem), (void *)&d_q1));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 1, sizeof(cl_mem), (void *)&d_q2));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 8, sizeof(int), (void *)&gray));
+        // shared_mem
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 12, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(
+            BFS_kernel_M, 13, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 14, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 15, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 16, sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 17, sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 18, sizeof(int), NULL));
 
         pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
 
-        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_M,1,0,grid,block,0,0,0));
+        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_M, 1,
+                                                0, grid, block, 0, 0, 0));
         OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
 
         pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
-        OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,switch_kd,CL_TRUE,0,sizeof(int),&switch_k,0,NULL,NULL));
+        OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, switch_kd, CL_TRUE,
+                                             0, sizeof(int), &switch_k, 0, NULL,
+                                             NULL));
         pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-        if(!switch_k){
+        if (!switch_k) {
           k--;
         }
       } else {
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,0,sizeof(cl_mem),(void*)&d_q1));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,1,sizeof(cl_mem),(void*)&d_q2));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,8,sizeof(int),(void*)&gray));
-        //shared_mem
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,10,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,11,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,12,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,13,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,14,sizeof(int),NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 0, sizeof(cl_mem), (void *)&d_q1));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 1, sizeof(cl_mem), (void *)&d_q2));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 8, sizeof(int), (void *)&gray));
+        // shared_mem
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 10, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(
+            BFS_kernel_L, 11, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 12, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 13, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 14, sizeof(int), NULL));
         pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_L,1,0,grid,block,0,0,0));
+        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_L, 1,
+                                                0, grid, block, 0, 0, 0));
         OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
         pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
       }
-    }
-    else {
+    } else {
       int gray = GRAY1;
-      if(num_of_blocks == 1) {
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,0,sizeof(cl_mem),(void*)&d_q2));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,1,sizeof(cl_mem),(void*)&d_q1));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,8,sizeof(int),(void*)&gray));
-        //shared_mem
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,10,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,11,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,12,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,13,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,14,MAX_THREADS_PER_BLOCK*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S,15,sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_S,1,0,grid,block,0,0,0));
+      if (num_of_blocks == 1) {
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_S, 0, sizeof(cl_mem), (void *)&d_q2));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_S, 1, sizeof(cl_mem), (void *)&d_q1));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 8, sizeof(int), (void *)&gray));
+        // shared_mem
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_S, 10, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(
+            BFS_kernel_S, 11, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_S, 12, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_S, 13, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(
+            BFS_kernel_S, 14, MAX_THREADS_PER_BLOCK * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_S, 15, sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_S, 1,
+                                                0, grid, block, 0, 0, 0));
         OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
       } else if (num_of_blocks <= NUM_SM) {
-        OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue,num_td,CL_TRUE,0,sizeof(int),&num_t,0,NULL,NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,0,sizeof(cl_mem),(void*)&d_q2));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,1,sizeof(cl_mem),(void*)&d_q1));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,8,sizeof(int),(void*)&gray));
-        //shared_mem
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,12,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,13,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,14,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,15,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,16,sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,17,sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M,18,sizeof(int),NULL));
+        OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, num_td, CL_TRUE,
+                                              0, sizeof(int), &num_t, 0, NULL,
+                                              NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 0, sizeof(cl_mem), (void *)&d_q2));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 1, sizeof(cl_mem), (void *)&d_q1));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 8, sizeof(int), (void *)&gray));
+        // shared_mem
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 12, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(
+            BFS_kernel_M, 13, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 14, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_M, 15, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 16, sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 17, sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_M, 18, sizeof(int), NULL));
         pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_M,1,0,grid,block,0,0,0));
+        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_M, 1,
+                                                0, grid, block, 0, 0, 0));
         OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
         pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-        OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,switch_kd,CL_TRUE,0,sizeof(int),&switch_k,0,NULL,NULL));
+        OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, switch_kd, CL_TRUE,
+                                             0, sizeof(int), &switch_k, 0, NULL,
+                                             NULL));
         pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-        if(!switch_k){
+        if (!switch_k) {
           k--;
         }
       } else {
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,0,sizeof(cl_mem),(void*)&d_q2));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,1,sizeof(cl_mem),(void*)&d_q1));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,8,sizeof(int),(void*)&gray));
-        //shared_mem
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,10,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,11,NUM_BIN*W_QUEUE_SIZE*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,12,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,13,NUM_BIN*sizeof(int),NULL));
-        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L,14,sizeof(int),NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 0, sizeof(cl_mem), (void *)&d_q2));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 1, sizeof(cl_mem), (void *)&d_q1));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 8, sizeof(int), (void *)&gray));
+        // shared_mem
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 10, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(
+            BFS_kernel_L, 11, NUM_BIN * W_QUEUE_SIZE * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 12, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(
+            clSetKernelArg(BFS_kernel_L, 13, NUM_BIN * sizeof(int), NULL));
+        OCL_ERRCK_RETVAL(clSetKernelArg(BFS_kernel_L, 14, sizeof(int), NULL));
 
         pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue,BFS_kernel_L,1,0,grid,block,0,0,0));
+        OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, BFS_kernel_L, 1,
+                                                0, grid, block, 0, 0, 0));
         OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
 
         pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
       }
     }
     k++;
-  } while(1);
+  } while (1);
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
   // copy result from device to host
-  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_cost,CL_TRUE,0,num_of_nodes*sizeof(int),h_cost,0,NULL,NULL));
-  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue,d_color,CL_TRUE,0,num_of_nodes*sizeof(int),color,0,NULL,NULL));
+  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_cost, CL_TRUE, 0,
+                                       num_of_nodes * sizeof(int), h_cost, 0,
+                                       NULL, NULL));
+  OCL_ERRCK_RETVAL(clEnqueueReadBuffer(clCommandQueue, d_color, CL_TRUE, 0,
+                                       num_of_nodes * sizeof(int), color, 0,
+                                       NULL, NULL));
 
   OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_nodes));
   OCL_ERRCK_RETVAL(clReleaseMemObject(d_graph_edges));
   OCL_ERRCK_RETVAL(clReleaseMemObject(d_color));
   OCL_ERRCK_RETVAL(clReleaseMemObject(d_cost));
   OCL_ERRCK_RETVAL(clReleaseMemObject(tail));
-  
+
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  //Store the result into a file
-  //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  FILE *fp = fopen(params->outFile,"w");
+  // Store the result into a file
+  // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  FILE *fp = fopen(params->outFile, "w");
   fprintf(fp, "%d\n", num_of_nodes);
   int j = 0;
-  for(j=0;j<num_of_nodes;j++)
-    fprintf(fp,"%d %d\n",j,h_cost[j]);
+  for (j = 0; j < num_of_nodes; j++)
+    fprintf(fp, "%d %d\n", j, h_cost[j]);
   fclose(fp);
 
   // cleanup memory
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/visc/config.h b/hpvm/test/parboil/benchmarks/bfs/src/visc/config.h
index 1a00ef98e054e50e654b0a52ccbb05ce136bab27..f9cdb59e9cd6cc39364fd9389ee39216646aedb2 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/visc/config.h
+++ b/hpvm/test/parboil/benchmarks/bfs/src/visc/config.h
@@ -1,7 +1,8 @@
 #define MAX_THREADS_PER_BLOCK 256
-#define LOCAL_MEM_SIZE 1600 //This needs to be adjusted for certain graphs with high degrees
-#define INF 2147483647//2^31-1
-#define UP_LIMIT 16677216//2^24
+#define LOCAL_MEM_SIZE                                                         \
+  1600 // This needs to be adjusted for certain graphs with high degrees
+#define INF 2147483647    // 2^31-1
+#define UP_LIMIT 16677216 // 2^24
 #define WHITE 16677217
 #define GRAY 16677218
 #define GRAY0 16677219
diff --git a/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp
index 6227ef498f10eb82e685f4dab518caf17e7757ac..9491218e5e93d39fc1bda4fac3c14770ee48645b 100644
--- a/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/bfs/src/visc/main.cpp
@@ -12,84 +12,84 @@
   Copyright (c) 2010 University of Illinois at Urbana-Champaign.
   All rights reserved.
 
-  Permission to use, copy, modify and distribute this software and its documentation for
-  educational purpose is hereby granted without fee, provided that the above copyright
-  notice and this permission notice appear in all copies of this software and that you do
-  not sell the software.
+  Permission to use, copy, modify and distribute this software and its
+  documentation for educational purpose is hereby granted without fee, provided
+  that the above copyright notice and this permission notice appear in all
+  copies of this software and that you do not sell the software.
 
-  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS, IMPLIED OR
-  OTHERWISE.
+  THE SOFTWARE IS PROVIDED "AS IS" AND WITHOUT WARRANTY OF ANY KIND,EXPRESS,
+  IMPLIED OR OTHERWISE.
 
   Author: Lijiuan Luo (lluo3@uiuc.edu)
-  Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu (gengliu2@illinois.edu)
+  Revised for Parboil 2.5 Benchmark Suite by: Geng Daniel Liu
+  (gengliu2@illinois.edu)
 */
-#include <stdlib.h>
+#include "config.h"
+#include "parboil.h"
+#include <math.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "parboil.h"
-#include "config.h"
 #include <visc.h>
 
 /**********
 Define colors for BFS
-1) the definition of White, gray and black comes from the text book "Introduction to Algorithms"
-2) For path search problems, people may choose to use different colors to record the found paths.
-Therefore we reserve numbers (0-16677216) for this purpose. Only nodes with colors bigger than
-UP_LIMIT are free to visit 
-3) We define two gray shades to differentiate between the new frontier nodes and the old frontier nodes that
- have not been marked BLACK 
+1) White, gray and black follow the textbook "Introduction to Algorithms".
+2) For path search problems, people may choose to use different colors to
+record the found paths. Therefore we reserve numbers (0-16677216) for this
+purpose. Only nodes with colors bigger than UP_LIMIT are free to visit.
+3) We define two gray shades to differentiate between the new frontier nodes
+and the old frontier nodes that have not been marked BLACK.
 *************/
 
 //#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics: enable
 //#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
 //#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics: enable
 /*****************************************************************************
-This is the  most general version of BFS kernel, i.e. no assumption about #block in the grid  
-\param q1: the array to hold the current frontier
-\param q2: the array to hold the new frontier
-\param g_graph_nodes: the nodes in the input graph
-\param g_graph_edges: the edges i nthe input graph
-\param g_color: the colors of nodes
-\param g_cost: the costs of nodes
-\param no_of_nodes: the number of nodes in the current frontier
-\param tail: pointer to the location of the tail of the new frontier. *tail is the size of the new frontier 
-\param gray_shade: the shade of the gray in current BFS propagation. See GRAY0, GRAY1 macro definitions for more details
-\param k: the level of current propagation in the BFS tree. k= 0 for the first propagation.
+This is the  most general version of BFS kernel, i.e. no assumption about #block
+in the grid \param q1: the array to hold the current frontier \param q2: the
+array to hold the new frontier \param g_graph_nodes: the nodes in the input
+graph \param g_graph_edges: the edges in the input graph \param g_color: the
+colors of nodes \param g_cost: the costs of nodes \param no_of_nodes: the number
+of nodes in the current frontier \param tail: pointer to the location of the
+tail of the new frontier. *tail is the size of the new frontier \param
+gray_shade: the shade of the gray in current BFS propagation. See GRAY0, GRAY1
+macro definitions for more details \param k: the level of current propagation in
+the BFS tree. k= 0 for the first propagation.
 ***********************************************************************/
 
-//typedef struct {
+// typedef struct {
 //} VoidRetTy;
 
 typedef struct __attribute__((__packed__)) {
-  int* q1; size_t bytesq1; 
-  int* q2; size_t bytesq2; 
-  struct Node* graph_nodes; size_t bytes_graph_nodes; 
-  struct Edge* graph_edges; size_t bytes_graph_edges;
-  int* color; size_t bytes_color;
-  int* cost; size_t bytes_cost;
-  int* tail; size_t bytes_tail;
+  int *q1;
+  size_t bytesq1;
+  int *q2;
+  size_t bytesq2;
+  struct Node *graph_nodes;
+  size_t bytes_graph_nodes;
+  struct Edge *graph_edges;
+  size_t bytes_graph_edges;
+  int *color;
+  size_t bytes_color;
+  int *cost;
+  size_t bytes_cost;
+  int *tail;
+  size_t bytes_tail;
   int no_of_nodes;
   int gray_shade;
   int k;
   long block;
   long grid;
-  //VoidRetTy* out;
+  // VoidRetTy* out;
 } RootIn;
 
-void packData(RootIn* args,
-              int* q1, size_t bytesq1, 
-              int* q2, size_t bytesq2, 
-              struct Node* graph_nodes, size_t bytes_graph_nodes, 
-              struct Edge* graph_edges, size_t bytes_graph_edges,
-              int* color, size_t bytes_color,
-              int* cost, size_t bytes_cost,
-              int* tail, size_t bytes_tail,
-              int no_of_nodes,
-              int gray_shade,
-              int k,
-              long block,
-              long grid) {
+void packData(RootIn *args, int *q1, size_t bytesq1, int *q2, size_t bytesq2,
+              struct Node *graph_nodes, size_t bytes_graph_nodes,
+              struct Edge *graph_edges, size_t bytes_graph_edges, int *color,
+              size_t bytes_color, int *cost, size_t bytes_cost, int *tail,
+              size_t bytes_tail, int no_of_nodes, int gray_shade, int k,
+              long block, long grid) {
   args->q1 = q1;
   args->bytesq1 = bytesq1;
   args->q2 = q2;
@@ -113,78 +113,72 @@ void packData(RootIn* args,
 
 void Allocation(long block) {
   // Memory shared between threadblocks
-  void* local_q_tail = __visc__malloc(sizeof(int));
-  void* local_q = __visc__malloc(LOCAL_MEM_SIZE*sizeof(int));
-  void* shift = __visc__malloc(sizeof(int));
-  
-  __visc__return(6, local_q_tail, sizeof(int), local_q, LOCAL_MEM_SIZE*sizeof(int), shift, sizeof(int));
+  void *local_q_tail = __visc__malloc(sizeof(int));
+  void *local_q = __visc__malloc(LOCAL_MEM_SIZE * sizeof(int));
+  void *shift = __visc__malloc(sizeof(int));
+
+  __visc__return(6, local_q_tail, sizeof(int), local_q,
+                 LOCAL_MEM_SIZE * sizeof(int), shift, sizeof(int));
 }
 
-//VoidRetTy
-void
-BFSLeaf(int *q1, size_t bytesq1, 
-           int *q2, size_t bytesq2,
-           struct Node *g_graph_nodes, size_t bytesg_graph_nodes,
-           struct Edge *g_graph_edges, size_t bytesg_graph_edges,
-           int *g_color, size_t bytesg_color,
-           int *g_cost, size_t bytesg_cost,
-           int *tail, size_t bytestail,
-           int no_of_nodes,
-           int gray_shade, 
-           int k,
-           // data local to thread block. The next three arguments should
-           // ideally be placed in local memory
-           int *local_q_tail, size_t byteslocal_q_tail, 
-           int *local_q, size_t byteslocal_q,
-           int *shift, size_t bytesshift
-           ) 
-{
+// VoidRetTy
+void BFSLeaf(int *q1, size_t bytesq1, int *q2, size_t bytesq2,
+             struct Node *g_graph_nodes, size_t bytesg_graph_nodes,
+             struct Edge *g_graph_edges, size_t bytesg_graph_edges,
+             int *g_color, size_t bytesg_color, int *g_cost, size_t bytesg_cost,
+             int *tail, size_t bytestail, int no_of_nodes, int gray_shade,
+             int k,
+             // data local to thread block. The next three arguments should
+             // ideally be placed in local memory
+             int *local_q_tail, size_t byteslocal_q_tail, int *local_q,
+             size_t byteslocal_q, int *shift, size_t bytesshift) {
 
   __visc__hint(visc::DEVICE);
   __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
-                      4, q2, g_color, g_cost, tail);
+                     4, q2, g_color, g_cost, tail);
 
-  void* thisNode = __visc__getNode();
-  void* parentNode = __visc__getParentNode(thisNode);
+  void *thisNode = __visc__getNode();
+  void *parentNode = __visc__getParentNode(thisNode);
   int lx = __visc__getNodeInstanceID_x(thisNode);
   int gx = __visc__getNodeInstanceID_x(parentNode);
   int dimx = __visc__getNumNodeInstances_x(thisNode);
 
-  if(lx == 0){
-    *local_q_tail = 0;//initialize the tail of w-queue
+  if (lx == 0) {
+    *local_q_tail = 0; // initialize the tail of w-queue
   }
 
-  __visc__barrier(); 
+  __visc__barrier();
 
-  //first, propagate and add the new frontier elements into w-queues
-  //int tid = get_group_id(0)*MAX_THREADS_PER_BLOCK + get_local_id(0);
+  // first, propagate and add the new frontier elements into w-queues
+  // int tid = get_group_id(0)*MAX_THREADS_PER_BLOCK + get_local_id(0);
   int tid = gx * dimx + lx;
 
-  if( tid<no_of_nodes)
-  {
-    int pid = q1[tid]; //the current frontier node, or the parent node of the new frontier nodes 
+  if (tid < no_of_nodes) {
+    int pid = q1[tid]; // the current frontier node, or the parent node of the
+                       // new frontier nodes
     g_color[pid] = BLACK;
     int cur_cost = g_cost[pid];
-    //into
+    // into
     struct Node cur_node = g_graph_nodes[pid];
 
-    for(int i=cur_node.x; i<cur_node.y + cur_node.x; i++)//visit each neighbor of the
-      //current frontier node.
+    for (int i = cur_node.x; i < cur_node.y + cur_node.x;
+         i++) // visit each neighbor of the
+              // current frontier node.
     {
       struct Edge cur_edge = g_graph_edges[i];
       int id = cur_edge.x;
       int cost = cur_edge.y;
       cost += cur_cost;
 
-      int orig_cost = __visc__atomic_min(&g_cost[id],cost);
-      if(orig_cost > cost){//the node should be visited
-        if(g_color[id] > UP_LIMIT){
-         int old_color = __visc__atomic_xchg(&g_color[id],gray_shade);
-          //this guarantees that only one thread will push this node
-          //into a queue
-          if(old_color != gray_shade) {
-            //atomic operation guarantees the correctness
-            //even if multiple warps are executing simultaneously
+      int orig_cost = __visc__atomic_min(&g_cost[id], cost);
+      if (orig_cost > cost) { // the node should be visited
+        if (g_color[id] > UP_LIMIT) {
+          int old_color = __visc__atomic_xchg(&g_color[id], gray_shade);
+          // this guarantees that only one thread will push this node
+          // into a queue
+          if (old_color != gray_shade) {
+            // atomic operation guarantees the correctness
+            // even if multiple warps are executing simultaneously
             int index = __visc__atomic_add(local_q_tail, 1);
             local_q[index] = id;
           }
@@ -192,111 +186,100 @@ BFSLeaf(int *q1, size_t bytesq1,
       }
     }
   }
-  
-  __visc__barrier(); 
-
-  if(lx == 0){
-    int tot_sum = *local_q_tail; 
-    //the offset or "shift" of the block-level queue within the grid-level queue
-    //is determined by atomic operation
-    *shift = __visc__atomic_add (tail,tot_sum);
+
+  __visc__barrier();
+
+  if (lx == 0) {
+    int tot_sum = *local_q_tail;
+    // the offset or "shift" of the block-level queue within the grid-level
+    // queue is determined by atomic operation
+    *shift = __visc__atomic_add(tail, tot_sum);
   }
-  
-  __visc__barrier(); 
 
-  //shift within a w-queue
+  __visc__barrier();
+
+  // shift within a w-queue
   int local_shift = lx;
 
-  while(local_shift < *local_q_tail){
+  while (local_shift < *local_q_tail) {
     q2[*shift + local_shift] = local_q[local_shift];
-    //multiple threads are copying elements at the same time,
-    //so we shift by multiple elements for next iteration  
+    // multiple threads are copying elements at the same time,
+    // so we shift by multiple elements for next iteration
     local_shift += dimx;
   }
 }
 
-//VoidRetTy
-void BlockingBFS(int *q1, size_t bytesq1, 
-                int *q2, size_t bytesq2,
-                struct Node *g_graph_nodes, size_t bytesg_graph_nodes,
-                struct Edge *g_graph_edges, size_t bytesg_graph_edges,
-                int *g_color, size_t bytesg_color,
-                int *g_cost, size_t bytesg_cost,
-                int *tail, size_t bytestail,
-                int no_of_nodes,
-                int gray_shade, 
-                int k,
-                long block,
-                // data local to thread block. The next three arguments should
-                // ideally be placed in local memory
-                int *local_q_tail, size_t byteslocal_q_tail, 
-                int *local_q, size_t byteslocal_q,
-                int *shift, size_t bytesshift) {
+// VoidRetTy
+void BlockingBFS(int *q1, size_t bytesq1, int *q2, size_t bytesq2,
+                 struct Node *g_graph_nodes, size_t bytesg_graph_nodes,
+                 struct Edge *g_graph_edges, size_t bytesg_graph_edges,
+                 int *g_color, size_t bytesg_color, int *g_cost,
+                 size_t bytesg_cost, int *tail, size_t bytestail,
+                 int no_of_nodes, int gray_shade, int k, long block,
+                 // data local to thread block. The next three arguments should
+                 // ideally be placed in local memory
+                 int *local_q_tail, size_t byteslocal_q_tail, int *local_q,
+                 size_t byteslocal_q, int *shift, size_t bytesshift) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
-                      4, q2, g_color, g_cost, tail);
+                     4, q2, g_color, g_cost, tail);
 
-  void* AllocationNode = __visc__createNodeND(0, Allocation);
-  void* BFSLeafNode = __visc__createNodeND(1, BFSLeaf, block);
+  void *AllocationNode = __visc__createNodeND(0, Allocation);
+  void *BFSLeafNode = __visc__createNodeND(1, BFSLeaf, block);
 
   // Bind edges
   __visc__bindIn(AllocationNode, 17, 0, 0); // Bind block
-  __visc__bindIn(BFSLeafNode, 0, 0, 0); // Bind q1
-  __visc__bindIn(BFSLeafNode, 1, 1, 0); // Bind bytes_q1
-  __visc__bindIn(BFSLeafNode, 2, 2, 0); // Bind q2
-  __visc__bindIn(BFSLeafNode, 3, 3, 0); // Bind bytes_q2 
-  __visc__bindIn(BFSLeafNode, 4, 4, 0); // Bind graph_nodes
-  __visc__bindIn(BFSLeafNode, 5, 5, 0); // Bind bytes_graph_nodes
-  __visc__bindIn(BFSLeafNode, 6, 6, 0); // Bind graph_edges
-  __visc__bindIn(BFSLeafNode, 7, 7, 0); // Bind bytes_graph_edges
-  __visc__bindIn(BFSLeafNode, 8, 8, 0); // Bind color
-  __visc__bindIn(BFSLeafNode, 9, 9, 0); // Bind bytes_color
-  __visc__bindIn(BFSLeafNode, 10, 10, 0); // Bind cost
-  __visc__bindIn(BFSLeafNode, 11, 11, 0); // Bind bytes_cost
-  __visc__bindIn(BFSLeafNode, 12, 12, 0); // Bind tail
-  __visc__bindIn(BFSLeafNode, 13, 13, 0); // Bind bytes_tail
-  __visc__bindIn(BFSLeafNode, 14, 14, 0); // Bind no_of_nodes
-  __visc__bindIn(BFSLeafNode, 15, 15, 0); // Bind gray_shade
-  __visc__bindIn(BFSLeafNode, 16, 16, 0); // Bind k
-  
+  __visc__bindIn(BFSLeafNode, 0, 0, 0);     // Bind q1
+  __visc__bindIn(BFSLeafNode, 1, 1, 0);     // Bind bytes_q1
+  __visc__bindIn(BFSLeafNode, 2, 2, 0);     // Bind q2
+  __visc__bindIn(BFSLeafNode, 3, 3, 0);     // Bind bytes_q2
+  __visc__bindIn(BFSLeafNode, 4, 4, 0);     // Bind graph_nodes
+  __visc__bindIn(BFSLeafNode, 5, 5, 0);     // Bind bytes_graph_nodes
+  __visc__bindIn(BFSLeafNode, 6, 6, 0);     // Bind graph_edges
+  __visc__bindIn(BFSLeafNode, 7, 7, 0);     // Bind bytes_graph_edges
+  __visc__bindIn(BFSLeafNode, 8, 8, 0);     // Bind color
+  __visc__bindIn(BFSLeafNode, 9, 9, 0);     // Bind bytes_color
+  __visc__bindIn(BFSLeafNode, 10, 10, 0);   // Bind cost
+  __visc__bindIn(BFSLeafNode, 11, 11, 0);   // Bind bytes_cost
+  __visc__bindIn(BFSLeafNode, 12, 12, 0);   // Bind tail
+  __visc__bindIn(BFSLeafNode, 13, 13, 0);   // Bind bytes_tail
+  __visc__bindIn(BFSLeafNode, 14, 14, 0);   // Bind no_of_nodes
+  __visc__bindIn(BFSLeafNode, 15, 15, 0);   // Bind gray_shade
+  __visc__bindIn(BFSLeafNode, 16, 16, 0);   // Bind k
+
   // Create Edges between AllocationNode and BFSLeafNodeNode
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 0, 17, 0); // Edge local_q_tail 
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 1, 18, 0); // Edge bytes_local_q_tail 
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 2, 19, 0); // Edge local_q 
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 3, 20, 0); // Edge bytes_local_q 
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 4, 21, 0); // Edge shift 
-  __visc__edge(AllocationNode, BFSLeafNode, 1, 5, 22, 0); // Edge bytes_shift 
+  __visc__edge(AllocationNode, BFSLeafNode, 1, 0, 17, 0); // Edge local_q_tail
+  __visc__edge(AllocationNode, BFSLeafNode, 1, 1, 18,
+               0); // Edge bytes_local_q_tail
+  __visc__edge(AllocationNode, BFSLeafNode, 1, 2, 19, 0); // Edge local_q
+  __visc__edge(AllocationNode, BFSLeafNode, 1, 3, 20, 0); // Edge bytes_local_q
+  __visc__edge(AllocationNode, BFSLeafNode, 1, 4, 21, 0); // Edge shift
+  __visc__edge(AllocationNode, BFSLeafNode, 1, 5, 22, 0); // Edge bytes_shift
 }
 
-//VoidRetTy
-void BFS_Root(int *q1, size_t bytesq1, 
-              int *q2, size_t bytesq2,
+// VoidRetTy
+void BFS_Root(int *q1, size_t bytesq1, int *q2, size_t bytesq2,
               struct Node *g_graph_nodes, size_t bytesg_graph_nodes,
               struct Edge *g_graph_edges, size_t bytesg_graph_edges,
-              int *g_color, size_t bytesg_color,
-              int *g_cost, size_t bytesg_cost,
-              int *tail, size_t bytestail,
-              int no_of_nodes,
-              int gray_shade, 
-              int k,
-              long block,
-              long grid) {
+              int *g_color, size_t bytesg_color, int *g_cost,
+              size_t bytesg_cost, int *tail, size_t bytestail, int no_of_nodes,
+              int gray_shade, int k, long block, long grid) {
   __visc__hint(visc::CPU_TARGET);
-  __visc__attributes( 6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
-                      4, q2, g_color, g_cost, tail);
-  void* BlockingBFSNode = __visc__createNodeND(1, BlockingBFS, grid);
+  __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
+                     4, q2, g_color, g_cost, tail);
+  void *BlockingBFSNode = __visc__createNodeND(1, BlockingBFS, grid);
 
   // Bind edges
-  __visc__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1
-  __visc__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1
-  __visc__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2
-  __visc__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 
-  __visc__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes
-  __visc__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes
-  __visc__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges
-  __visc__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges
-  __visc__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color
-  __visc__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color
+  __visc__bindIn(BlockingBFSNode, 0, 0, 0);   // Bind q1
+  __visc__bindIn(BlockingBFSNode, 1, 1, 0);   // Bind bytes_q1
+  __visc__bindIn(BlockingBFSNode, 2, 2, 0);   // Bind q2
+  __visc__bindIn(BlockingBFSNode, 3, 3, 0);   // Bind bytes_q2
+  __visc__bindIn(BlockingBFSNode, 4, 4, 0);   // Bind graph_nodes
+  __visc__bindIn(BlockingBFSNode, 5, 5, 0);   // Bind bytes_graph_nodes
+  __visc__bindIn(BlockingBFSNode, 6, 6, 0);   // Bind graph_edges
+  __visc__bindIn(BlockingBFSNode, 7, 7, 0);   // Bind bytes_graph_edges
+  __visc__bindIn(BlockingBFSNode, 8, 8, 0);   // Bind color
+  __visc__bindIn(BlockingBFSNode, 9, 9, 0);   // Bind bytes_color
   __visc__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost
   __visc__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost
   __visc__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail
@@ -305,39 +288,34 @@ void BFS_Root(int *q1, size_t bytesq1,
   __visc__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade
   __visc__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k
   __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block
-
-  
 }
 
-void BFS_Wrapper(
-  int *q1, size_t bytesq1, // 0, 1
-  int *q2, size_t bytesq2, // 2, 3
-  struct Node *g_graph_nodes, size_t bytesg_graph_nodes, // 4, 5
-  struct Edge *g_graph_edges, size_t bytesg_graph_edges, // 6, 7
-  int *g_color, size_t bytesg_color, // 8, 9
-  int *g_cost, size_t bytesg_cost, // 10, 11
-  int *tail, size_t bytestail, // 12, 13
-  int no_of_nodes, int gray_shade, // 14, 15
-  int k, long block, long grid // 16 - 18
+void BFS_Wrapper(int *q1, size_t bytesq1,                               // 0, 1
+                 int *q2, size_t bytesq2,                               // 2, 3
+                 struct Node *g_graph_nodes, size_t bytesg_graph_nodes, // 4, 5
+                 struct Edge *g_graph_edges, size_t bytesg_graph_edges, // 6, 7
+                 int *g_color, size_t bytesg_color,                     // 8, 9
+                 int *g_cost, size_t bytesg_cost, // 10, 11
+                 int *tail, size_t bytestail,     // 12, 13
+                 int no_of_nodes, int gray_shade, // 14, 15
+                 int k, long block, long grid     // 16 - 18
 ) {
   __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(
-    6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
-    4, q2, g_color, g_cost, tail
-  );
-  void* BlockingBFSNode = __visc__createNodeND(0, BFS_Root);
+  __visc__attributes(6, q1, g_graph_nodes, g_graph_edges, g_color, g_cost, tail,
+                     4, q2, g_color, g_cost, tail);
+  void *BlockingBFSNode = __visc__createNodeND(0, BFS_Root);
 
   // Bind edges
-  __visc__bindIn(BlockingBFSNode, 0, 0, 0); // Bind q1
-  __visc__bindIn(BlockingBFSNode, 1, 1, 0); // Bind bytes_q1
-  __visc__bindIn(BlockingBFSNode, 2, 2, 0); // Bind q2
-  __visc__bindIn(BlockingBFSNode, 3, 3, 0); // Bind bytes_q2 
-  __visc__bindIn(BlockingBFSNode, 4, 4, 0); // Bind graph_nodes
-  __visc__bindIn(BlockingBFSNode, 5, 5, 0); // Bind bytes_graph_nodes
-  __visc__bindIn(BlockingBFSNode, 6, 6, 0); // Bind graph_edges
-  __visc__bindIn(BlockingBFSNode, 7, 7, 0); // Bind bytes_graph_edges
-  __visc__bindIn(BlockingBFSNode, 8, 8, 0); // Bind color
-  __visc__bindIn(BlockingBFSNode, 9, 9, 0); // Bind bytes_color
+  __visc__bindIn(BlockingBFSNode, 0, 0, 0);   // Bind q1
+  __visc__bindIn(BlockingBFSNode, 1, 1, 0);   // Bind bytes_q1
+  __visc__bindIn(BlockingBFSNode, 2, 2, 0);   // Bind q2
+  __visc__bindIn(BlockingBFSNode, 3, 3, 0);   // Bind bytes_q2
+  __visc__bindIn(BlockingBFSNode, 4, 4, 0);   // Bind graph_nodes
+  __visc__bindIn(BlockingBFSNode, 5, 5, 0);   // Bind bytes_graph_nodes
+  __visc__bindIn(BlockingBFSNode, 6, 6, 0);   // Bind graph_edges
+  __visc__bindIn(BlockingBFSNode, 7, 7, 0);   // Bind bytes_graph_edges
+  __visc__bindIn(BlockingBFSNode, 8, 8, 0);   // Bind color
+  __visc__bindIn(BlockingBFSNode, 9, 9, 0);   // Bind bytes_color
   __visc__bindIn(BlockingBFSNode, 10, 10, 0); // Bind cost
   __visc__bindIn(BlockingBFSNode, 11, 11, 0); // Bind bytes_cost
   __visc__bindIn(BlockingBFSNode, 12, 12, 0); // Bind tail
@@ -345,35 +323,31 @@ void BFS_Wrapper(
   __visc__bindIn(BlockingBFSNode, 14, 14, 0); // Bind no_of_nodes
   __visc__bindIn(BlockingBFSNode, 15, 15, 0); // Bind gray_shade
   __visc__bindIn(BlockingBFSNode, 16, 16, 0); // Bind k
-  __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block 
-  __visc__bindIn(BlockingBFSNode, 18, 18, 0); // Bind grid 
+  __visc__bindIn(BlockingBFSNode, 17, 17, 0); // Bind block
+  __visc__bindIn(BlockingBFSNode, 18, 18, 0); // Bind grid
 }
 
 FILE *fp;
-char* readFile(const char* fileName)
-{
-  FILE* fp;
-  fp = fopen(fileName,"r");
-  if(fp == NULL)
-  {
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
     printf("Error 1!\n");
     exit(1);
   }
 
-  fseek(fp,0,SEEK_END);
+  fseek(fp, 0, SEEK_END);
   long size = ftell(fp);
   rewind(fp);
 
-  char* buffer = (char*)malloc(sizeof(char)*size);
-  if(buffer  == NULL)
-  {
+  char *buffer = (char *)malloc(sizeof(char) * size);
+  if (buffer == NULL) {
     printf("Error 2!\n");
     fclose(fp);
     exit(1);
   }
-  size_t res = fread(buffer,1,size,fp);
-  if(res != size)
-  {
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
     printf("Error 3!\n");
     fclose(fp);
     exit(1);
@@ -384,63 +358,60 @@ char* readFile(const char* fileName)
 }
 const int h_top = 1;
 const int zero = 0;
-void runGPU(int argc, char** argv);
+void runGPU(int argc, char **argv);
 ////////////////////////////////////////////////////////////////////////////////
 // Main Program
 ////////////////////////////////////////////////////////////////////////////////
-int main( int argc, char** argv)
-{
+int main(int argc, char **argv) {
 
-  //the number of nodes in the graph
+  // the number of nodes in the graph
   int num_of_nodes = 0;
-  //the number of edges in the graph
+  // the number of edges in the graph
   int num_of_edges = 0;
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
 
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-  {
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) {
     fprintf(stderr, "Expecting one input filename\n");
     exit(-1);
   }
 
-  //Read in Graph from a file
-  fp = fopen(params->inpFiles[0],"r");
-  if(!fp)
-  {
+  // Read in Graph from a file
+  fp = fopen(params->inpFiles[0], "r");
+  if (!fp) {
     printf("Error Reading graph file\n");
     return 0;
   }
   int source;
 
-  fscanf(fp,"%d",&num_of_nodes);
+  fscanf(fp, "%d", &num_of_nodes);
   // allocate host memory
-  struct Node* graph_nodes = (struct Node*) malloc(sizeof(struct Node)*num_of_nodes);
-  int *color = (int*) malloc(sizeof(int)*num_of_nodes);
+  struct Node *graph_nodes =
+      (struct Node *)malloc(sizeof(struct Node) * num_of_nodes);
+  int *color = (int *)malloc(sizeof(int) * num_of_nodes);
   int start, edgeno;
   // initalize the memory
   int i;
-  for( i = 0; i < num_of_nodes; i++)
-  {
-    fscanf(fp,"%d %d",&start,&edgeno);
+  for (i = 0; i < num_of_nodes; i++) {
+    fscanf(fp, "%d %d", &start, &edgeno);
     graph_nodes[i].x = start;
     graph_nodes[i].y = edgeno;
-    color[i]=WHITE;
+    color[i] = WHITE;
   }
-  //read the source node from the file
-  fscanf(fp,"%d",&source);
-  fscanf(fp,"%d",&num_of_edges);
-  int id,edge_cost;
-  struct Edge* graph_edges = (struct Edge*) malloc(sizeof(struct Edge)*num_of_edges);
-  for(i=0; i < num_of_edges ; i++)
-  {
-    fscanf(fp,"%d",&id);
-    fscanf(fp,"%d",&edge_cost);
+  // read the source node from the file
+  fscanf(fp, "%d", &source);
+  fscanf(fp, "%d", &num_of_edges);
+  int id, edge_cost;
+  struct Edge *graph_edges =
+      (struct Edge *)malloc(sizeof(struct Edge) * num_of_edges);
+  for (i = 0; i < num_of_edges; i++) {
+    fscanf(fp, "%d", &id);
+    fscanf(fp, "%d", &edge_cost);
     graph_edges[i].x = id;
     graph_edges[i].y = edge_cost;
   }
-  if(fp)
+  if (fp)
     fclose(fp);
 
   pb_InitializeTimerSet(&timers);
@@ -448,19 +419,19 @@ int main( int argc, char** argv)
 
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // allocate mem for the result on host side
-  int* cost = (int*) malloc( sizeof(int)*num_of_nodes);
-  for(i = 0; i < num_of_nodes; i++){
+  int *cost = (int *)malloc(sizeof(int) * num_of_nodes);
+  for (i = 0; i < num_of_nodes; i++) {
     cost[i] = INF;
   }
   cost[source] = 0;
 
-  size_t bytes_graph_nodes = num_of_nodes* sizeof(struct Node);
-  size_t bytes_graph_edges = num_of_edges* sizeof(struct Edge);
-  size_t bytes_cost = sizeof(int) * num_of_nodes; 
+  size_t bytes_graph_nodes = num_of_nodes * sizeof(struct Node);
+  size_t bytes_graph_edges = num_of_edges * sizeof(struct Edge);
+  size_t bytes_cost = sizeof(int) * num_of_nodes;
 
-  int* q1 = (int*) malloc(sizeof(int)*num_of_nodes);
-  int* q2 = (int*) malloc(sizeof(int)*num_of_nodes);
-  int* tail = (int*) malloc(sizeof(int));
+  int *q1 = (int *)malloc(sizeof(int) * num_of_nodes);
+  int *q2 = (int *)malloc(sizeof(int) * num_of_nodes);
+  int *tail = (int *)malloc(sizeof(int));
 
   llvm_visc_track_mem(graph_nodes, bytes_graph_nodes);
   llvm_visc_track_mem(graph_edges, bytes_graph_edges);
@@ -478,50 +449,38 @@ int main( int argc, char** argv)
   // Initializations. Can some of these be done in the graph. That way we can
   // move these arrays completely in the graph
   *tail = h_top;
-  // Potential source of inefficiency. 
-  //Entire array would be copied intially
+  // Potential source of inefficiency.
+  // Entire array would be copied initially
   cost[0] = zero;
   q1[0] = source;
 
-  int num_t;//number of threads
-  int k=0;//BFS level index
+  int num_t; // number of threads
+  int k = 0; // BFS level index
   int gray;
 
   long grid = num_of_blocks;
   long block = num_of_threads_per_block;
   // Pack data in struct
-  RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-  packData(args, 
-          q1, bytes_cost, 
-          q2, bytes_cost,
-          graph_nodes, bytes_graph_nodes,
-          graph_edges, bytes_graph_edges,
-          color, bytes_cost,
-          cost, bytes_cost,
-          tail, sizeof(int),
-          num_of_nodes,
-          gray, 
-          k,
-          block,
-          grid
-          );
-
+  RootIn *args = (RootIn *)malloc(sizeof(RootIn));
+  packData(args, q1, bytes_cost, q2, bytes_cost, graph_nodes, bytes_graph_nodes,
+           graph_edges, bytes_graph_edges, color, bytes_cost, cost, bytes_cost,
+           tail, sizeof(int), num_of_nodes, gray, k, block, grid);
 
   pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  do
-  {
+  do {
     llvm_visc_request_mem(tail, sizeof(int));
     num_t = *tail;
-    //printf("tail for iteration %d = %d\n",k, num_t);
+    // printf("tail for iteration %d = %d\n",k, num_t);
     *tail = 0;
-    //tail = 0;
+    // tail = 0;
 
-    if(num_t == 0){//frontier is empty
+    if (num_t == 0) { // frontier is empty
       break;
     }
 
-    num_of_blocks = (int)ceil(num_t/(double)MAX_THREADS_PER_BLOCK);
-    num_of_threads_per_block = num_t > MAX_THREADS_PER_BLOCK ? MAX_THREADS_PER_BLOCK : num_t;
+    num_of_blocks = (int)ceil(num_t / (double)MAX_THREADS_PER_BLOCK);
+    num_of_threads_per_block =
+        num_t > MAX_THREADS_PER_BLOCK ? MAX_THREADS_PER_BLOCK : num_t;
 
     args->grid = num_of_blocks;
     args->block = num_of_threads_per_block;
@@ -529,33 +488,32 @@ int main( int argc, char** argv)
     args->no_of_nodes = num_t;
     args->k = k;
 
-    if(k%2 == 0){
+    if (k % 2 == 0) {
       args->gray_shade = GRAY0;
-    }
-    else{
+    } else {
       args->gray_shade = GRAY1;
     }
-    //void* bfsDFG = __visc__node(BFS_kernel, 2, 1, block, grid, 17, 
-                                //q1, bytes_cost, 
-                                //q2, bytes_cost,
-                                //graph_nodes, bytes_graph_nodes,
-                                //graph_edges, bytes_graph_edges,
-                                //color, bytes_cost,
-                                //cost, bytes_cost,
-                                //tail, sizeof(int),
-                                //num_of_nodes,
-                                //gray, 
-                                //k,
-                                //0);
-    void* bfsDFG = __visc__launch(0, BFS_Wrapper, (void*) args);
+    // void* bfsDFG = __visc__node(BFS_kernel, 2, 1, block, grid, 17,
+    // q1, bytes_cost,
+    // q2, bytes_cost,
+    // graph_nodes, bytes_graph_nodes,
+    // graph_edges, bytes_graph_edges,
+    // color, bytes_cost,
+    // cost, bytes_cost,
+    // tail, sizeof(int),
+    // num_of_nodes,
+    // gray,
+    // k,
+    // 0);
+    void *bfsDFG = __visc__launch(0, BFS_Wrapper, (void *)args);
     __visc__wait(bfsDFG);
     // Swap q1 and q2
     // Swap q1 and q2
-    int* temp = args->q1;
+    int *temp = args->q1;
     args->q1 = args->q2;
     args->q2 = temp;
     k++;
-  } while(1);
+  } while (1);
 
   // copy result from device to host
   pb_SwitchToTimer(&timers, pb_TimerID_COPY);
@@ -577,13 +535,13 @@ int main( int argc, char** argv)
   pb_PrintTimerSet(&timers);
   __visc__cleanup();
 
-  //Store the result into a file
-  //FIXME: color is not even printed. Why are we reading it back??
-  FILE *fp = fopen(params->outFile,"w");
+  // Store the result into a file
+  // FIXME: color is not even printed. Why are we reading it back??
+  FILE *fp = fopen(params->outFile, "w");
   fprintf(fp, "%d\n", num_of_nodes);
   int j = 0;
-  for(j=0;j<num_of_nodes;j++)
-    fprintf(fp,"%d %d\n",j,cost[j]);
+  for (j = 0; j < num_of_nodes; j++)
+    fprintf(fp, "%d %d\n", j, cost[j]);
   fclose(fp);
   // cleanup memory
   free(graph_nodes);
@@ -593,7 +551,6 @@ int main( int argc, char** argv)
   free(q1);
   free(q2);
 
-
   pb_FreeParameters(params);
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/base/atom.h
index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/base/atom.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/atom.h
@@ -13,22 +13,22 @@
 extern "C" {
 #endif
 
-  typedef struct Atom_t {
-    float x, y, z, q;
-  } Atom;
-
-  typedef struct Atoms_t {
-    Atom *atoms;
-    int size;
-  } Atoms;
-
-  typedef struct Vec3_t {
-    float x, y, z;
-  } Vec3;
-
-  Atoms *read_atom_file(const char *fname);
-  void free_atom(Atoms *atom);
-  void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
+typedef struct Atom_t {
+  float x, y, z, q;
+} Atom;
+
+typedef struct Atoms_t {
+  Atom *atoms;
+  int size;
+} Atoms;
+
+typedef struct Vec3_t {
+  float x, y, z;
+} Vec3;
+
+Atoms *read_atom_file(const char *fname);
+void free_atom(Atoms *atom);
+void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/base/cutcpu.c
index 14d183cc985acc6f3c6c1a2c1af5598314c186fa..e54192c9d32a1512e73ac1ea98689dda1d7d0169 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/base/cutcpu.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/cutcpu.c
@@ -6,25 +6,24 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include "cutoff.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-#include "cutoff.h"
 
 #undef DEBUG_PASS_RATE
 #define CHECK_CYLINDER_CPU
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int cpu_compute_cutoff_potential_lattice(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int
+cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                     float cutoff,     /* cutoff distance */
+                                     Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -39,8 +38,8 @@ extern int cpu_compute_cutoff_potential_lattice(
   const float inv_a2 = 1.f / a2;
   float s;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -62,7 +61,7 @@ extern int cpu_compute_cutoff_potential_lattice(
   int ncell, nxcell, nycell, nzcell;
   int *first, *next;
   float inv_cellen = INV_CELLEN;
-  Vec3 minext, maxext;		/* Extent of atom bounding box */
+  Vec3 minext, maxext; /* Extent of atom bounding box */
   float xmin, ymin, zmin;
   float xmax, ymax, zmax;
 
@@ -75,44 +74,45 @@ extern int cpu_compute_cutoff_potential_lattice(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(natoms * sizeof(int));
-  for (n = 0;  n < natoms;  n++) {
+  next = (int *)malloc(natoms * sizeof(int));
+  for (n = 0; n < natoms; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < natoms;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < natoms; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -123,26 +123,33 @@ extern int cpu_compute_cutoff_potential_lattice(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 #ifdef CHECK_CYLINDER_CPU
-          if (dydz2 >= a2) continue;
+          if (dydz2 >= a2)
+            continue;
 #endif
 
           dx = xstart;
@@ -150,27 +157,26 @@ extern int cpu_compute_cutoff_potential_lattice(
           pg = lattice->lattice + index;
 
 #if defined(__INTEL_COMPILER)
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
             s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s;
-            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
+            e = q * (1 / sqrtf(r2)) * s;
+            *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
           }
 #else
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
-            if (r2 >= a2)
-		{
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
+            if (r2 >= a2) {
 #ifdef DEBUG_PASS_RATE
-		  fail_count++;
+              fail_count++;
 #endif
-		  continue;
-		}
+              continue;
+            }
 #ifdef DEBUG_PASS_RATE
-	    pass_count++;
+            pass_count++;
 #endif
             s = (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s * s;
+            e = q * (1 / sqrtf(r2)) * s * s;
             *pg += e;
           }
 #endif
@@ -178,7 +184,7 @@ extern int cpu_compute_cutoff_potential_lattice(
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
@@ -186,8 +192,8 @@ extern int cpu_compute_cutoff_potential_lattice(
 
   /* For debugging: print the number of times that the test passed/failed */
 #ifdef DEBUG_PASS_RATE
-  printf ("Pass :%lld\n", pass_count);
-  printf ("Fail :%lld\n", fail_count);
+  printf("Pass :%lld\n", pass_count);
+  printf("Fail :%lld\n", fail_count);
 #endif
 
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/base/cutoff.h
index 477e5649b6ff4f58690fb80a017f8bcec86d135c..0f8b0ff96aaab0c84bfca49c112b717d568815b9 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/base/cutoff.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/cutoff.h
@@ -15,46 +15,44 @@ extern "C" {
 
 #define SHIFTED
 
-  /* A structure to record how points in 3D space map to array
-     elements.  Array element (z, y, x)
-     where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
-     maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
-  */
-  typedef struct LatticeDim_t {
-    /* Number of lattice points in x, y, z dimensions */
-    int nx, ny, nz;
-
-    /* Lowest corner of lattice */
-    Vec3 lo;
-
-    /* Lattice spacing */
-    float h;
-  } LatticeDim;
-
-  /* An electric potential field sampled on a regular grid.  The
-     lattice size and grid point positions are specified by 'dim'.
-  */
-  typedef struct Lattice_t {
-    LatticeDim dim;
-    float *lattice;
-  } Lattice;
-
-  LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
-
-  Lattice *create_lattice(LatticeDim dim);
-  void destroy_lattice(Lattice *);
-
-  int cpu_compute_cutoff_potential_lattice(
-      Lattice *lattice,                  /* the lattice */
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms                       /* array of atoms */
-    );
-
-  int remove_exclusions(
-      Lattice *lattice,                  /* the lattice */
-      float exclcutoff,                  /* exclusion cutoff distance */
-      Atoms *atom                        /* array of atoms */
-    );
+/* A structure to record how points in 3D space map to array
+   elements.  Array element (z, y, x)
+   where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
+   maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
+*/
+typedef struct LatticeDim_t {
+  /* Number of lattice points in x, y, z dimensions */
+  int nx, ny, nz;
+
+  /* Lowest corner of lattice */
+  Vec3 lo;
+
+  /* Lattice spacing */
+  float h;
+} LatticeDim;
+
+/* An electric potential field sampled on a regular grid.  The
+   lattice size and grid point positions are specified by 'dim'.
+*/
+typedef struct Lattice_t {
+  LatticeDim dim;
+  float *lattice;
+} Lattice;
+
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
+
+Lattice *create_lattice(LatticeDim dim);
+void destroy_lattice(Lattice *);
+
+int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                         float cutoff,     /* cutoff distance */
+                                         Atoms *atoms      /* array of atoms */
+);
+
+int remove_exclusions(Lattice *lattice, /* the lattice */
+                      float exclcutoff, /* exclusion cutoff distance */
+                      Atoms *atom       /* array of atoms */
+);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/base/excl.c
index 9598bda26b98a3f26bc36c9b616f11048c2e5860..26769b76d1dac3310e6f5066f3393133091d6477 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/base/excl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/excl.c
@@ -6,22 +6,20 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include "cutoff.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-#include "cutoff.h"
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int remove_exclusions(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* exclusion cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int remove_exclusions(Lattice *lattice, /* the lattice */
+                             float cutoff,     /* exclusion cutoff distance */
+                             Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -33,8 +31,8 @@ extern int remove_exclusions(
 
   const float a2 = cutoff * cutoff;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -62,44 +60,45 @@ extern int remove_exclusions(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(atoms->size * sizeof(int));
-  for (n = 0;  n < atoms->size;  n++) {
+  next = (int *)malloc(atoms->size * sizeof(int));
+  for (n = 0; n < atoms->size; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < atoms->size;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < atoms->size; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -110,42 +109,49 @@ extern int remove_exclusions(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
 
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 
           dx = xstart;
           index = jkoff + ia;
           pg = lattice->lattice + index;
 
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
 
-	    /* If atom and lattice point are too close, set the lattice value
-	     * to zero */
-            if (r2 < a2) *pg = 0;
+            /* If atom and lattice point are too close, set the lattice value
+             * to zero */
+            if (r2 < a2)
+              *pg = 0;
           }
         }
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/base/main.c
index 9b8ef2014dc7deab1e9238be8e9d9ea1d0cf4a38..d361c16a34a6821dff328235a3c8fd59283734bd 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/base/main.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/main.c
@@ -6,27 +6,26 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "parboil.h"
 #include "atom.h"
 #include "cutoff.h"
 #include "output.h"
+#include "parboil.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #define ERRTOL 1e-4f
 
-#define NOKERNELS             0
-#define CUTOFF1               1
-#define CUTOFF6              32
-#define CUTOFF6OVERLAP       64
-#define CUTOFFCPU         16384
-
+#define NOKERNELS 0
+#define CUTOFF1 1
+#define CUTOFF6 32
+#define CUTOFF6OVERLAP 64
+#define CUTOFFCPU 16384
 
 int appenddata(const char *filename, int size, double time) {
   FILE *fp;
-  fp=fopen(filename, "a");
+  fp = fopen(filename, "a");
   if (fp == NULL) {
     printf("error appending to file %s..\n", filename);
     return -1;
@@ -36,23 +35,19 @@ int appenddata(const char *filename, int size, double time) {
   return 0;
 }
 
-LatticeDim
-lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
-{
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) {
   LatticeDim ret;
 
-  ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
-  ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
-  ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
+  ret.nx = (int)floorf((hi.x - lo.x) / h) + 1;
+  ret.ny = (int)floorf((hi.y - lo.y) / h) + 1;
+  ret.nz = (int)floorf((hi.z - lo.z) / h) + 1;
   ret.lo = lo;
   ret.h = h;
 
   return ret;
 }
 
-Lattice *
-create_lattice(LatticeDim dim)
-{
+Lattice *create_lattice(LatticeDim dim) {
   int size;
   Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
 
@@ -75,10 +70,7 @@ create_lattice(LatticeDim dim)
   return lat;
 }
 
-
-void
-destroy_lattice(Lattice *lat)
-{
+void destroy_lattice(Lattice *lat) {
   if (lat) {
     free(lat->lattice);
     free(lat);
@@ -90,13 +82,13 @@ int main(int argc, char *argv[]) {
 
   LatticeDim lattice_dim;
   Lattice *cpu_lattice;
-  Vec3 min_ext, max_ext;	/* Bounding box of atoms */
-  Vec3 lo, hi;			/* Bounding box with padding  */
+  Vec3 min_ext, max_ext; /* Bounding box of atoms */
+  Vec3 lo, hi;           /* Bounding box with padding  */
 
-  float h = 0.5f;		/* Lattice spacing */
-  float cutoff = 12.f;		/* Cutoff radius */
-  float exclcutoff = 1.f;	/* Radius for exclusion */
-  float padding = 0.5f;		/* Bounding box padding distance */
+  float h = 0.5f;         /* Lattice spacing */
+  float cutoff = 12.f;    /* Cutoff radius */
+  float exclcutoff = 1.f; /* Radius for exclusion */
+  float padding = 0.5f;   /* Bounding box padding distance */
 
   int n;
 
@@ -136,9 +128,10 @@ int main(int argc, char *argv[]) {
   printf("  maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
 
   printf("padding domain by %g Angstroms\n", padding);
-  lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
-  hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
-  printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
+  lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
+  hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
+  printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y,
+         hi.z - lo.z);
 
   lattice_dim = lattice_from_bounding_box(lo, hi, h);
   cpu_lattice = create_lattice(lattice_dim);
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/base/output.c
index 814e2d4d8b045d4ed02acb22760623ece3b248ff..e3559f3a35c0875b03f7e1327025c0a1da5c6698 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/base/output.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/output.c
@@ -6,16 +6,14 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <math.h>
 #include "atom.h"
 #include "cutoff.h"
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice)
-{
+void write_lattice_summary(const char *filename, Lattice *lattice) {
   float *lattice_data = lattice->lattice;
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
@@ -36,9 +34,9 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int i;
 
     for (i = 0; i < nx * ny * nz; i++)
-      abspotential += fabs((double) lattice_data[i]);
+      abspotential += fabs((double)lattice_data[i]);
 
-    tmp = (float) abspotential;
+    tmp = (float)abspotential;
 
     fwrite(&tmp, 1, sizeof(float), outfile);
   }
@@ -47,7 +45,7 @@ write_lattice_summary(const char *filename, Lattice *lattice)
   {
     uint32_t tmp;
 
-    tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
+    tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny);
     fwrite(&tmp, 1, sizeof(uint32_t), outfile);
   }
 
@@ -56,8 +54,8 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int plane_size = nx * ny;
 
     fwrite(lattice_data, plane_size, sizeof(float), outfile);
-    fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
-	   outfile);
+    fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float),
+           outfile);
   }
 
   /* Cleanup */
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/base/output.h
index 13022cd9e80843157cc78d7d2ff12afa85a0f826..f6c24bfc80bc63d0236d69577f832984c74a9eac 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/base/output.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/output.h
@@ -15,8 +15,7 @@
 extern "C" {
 #endif
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice);
+void write_lattice_summary(const char *filename, Lattice *lattice);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/base/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/base/readatom.c
index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/base/readatom.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/base/readatom.c
@@ -6,36 +6,33 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-
 
 #define LINELEN 96
 #define INITLEN 20
 
-
-Atoms *read_atom_file(const char *fname)
-{
+Atoms *read_atom_file(const char *fname) {
   FILE *file;
   char line[LINELEN];
 
-  Atom *atom;			/* Atom array */
-  int len = INITLEN;		/* Size of atom array */
-  int cnt = 0;			/* Number of atoms read */
+  Atom *atom;        /* Atom array */
+  int len = INITLEN; /* Size of atom array */
+  int cnt = 0;       /* Number of atoms read */
 
   /* open atom "pqr" file */
   file = fopen(fname, "r");
-  if (NULL==file) {
+  if (NULL == file) {
     fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
     return NULL;
   }
 
   /* allocate initial atom array */
-  atom = (Atom *) malloc(len * sizeof(Atom));
-  if (NULL==atom) {
+  atom = (Atom *)malloc(len * sizeof(Atom));
+  if (NULL == atom) {
     fprintf(stderr, "can't allocate memory\n");
     return NULL;
   }
@@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname)
   while (fgets(line, LINELEN, file) != NULL) {
 
     if (strncmp(line, "ATOM  ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
-      continue;  /* skip anything that isn't an atom record */
+      continue; /* skip anything that isn't an atom record */
     }
 
-    if (cnt==len) {  /* extend atom array */
-      void *tmp = realloc(atom, 2*len*sizeof(Atom));
-      if (NULL==tmp) {
+    if (cnt == len) { /* extend atom array */
+      void *tmp = realloc(atom, 2 * len * sizeof(Atom));
+      if (NULL == tmp) {
         fprintf(stderr, "can't allocate more memory\n");
         return NULL;
       }
-      atom = (Atom *) tmp;
+      atom = (Atom *)tmp;
       len *= 2;
     }
 
     /* read position coordinates and charge from atom record */
     if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
-          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
-      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
+               &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
+      fprintf(stderr, "atom record %d does not have expected format\n",
+              cnt + 1);
       return NULL;
     }
 
-    cnt++;  /* count atoms as we store them */
+    cnt++; /* count atoms as we store them */
   }
 
   /* verify EOF and close file */
-  if ( !feof(file) ) {
+  if (!feof(file)) {
     fprintf(stderr, "did not find EOF\n");
     return NULL;
   }
@@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname)
   }
 }
 
-
-void free_atom(Atoms *atom)
-{
+void free_atom(Atoms *atom) {
   if (atom) {
     free(atom->atoms);
     free(atom);
   }
 }
 
-void
-get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
-{
+void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) {
   Atom *atoms = atom->atoms;
   int natoms = atom->size;
   Vec3 lo;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/atom.h
index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/atom.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/atom.h
@@ -13,22 +13,22 @@
 extern "C" {
 #endif
 
-  typedef struct Atom_t {
-    float x, y, z, q;
-  } Atom;
-
-  typedef struct Atoms_t {
-    Atom *atoms;
-    int size;
-  } Atoms;
-
-  typedef struct Vec3_t {
-    float x, y, z;
-  } Vec3;
-
-  Atoms *read_atom_file(const char *fname);
-  void free_atom(Atoms *atom);
-  void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
+typedef struct Atom_t {
+  float x, y, z, q;
+} Atom;
+
+typedef struct Atoms_t {
+  Atom *atoms;
+  int size;
+} Atoms;
+
+typedef struct Vec3_t {
+  float x, y, z;
+} Vec3;
+
+Atoms *read_atom_file(const char *fname);
+void free_atom(Atoms *atom);
+void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutcpu.c
index 14d183cc985acc6f3c6c1a2c1af5598314c186fa..e54192c9d32a1512e73ac1ea98689dda1d7d0169 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutcpu.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutcpu.c
@@ -6,25 +6,24 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include "cutoff.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-#include "cutoff.h"
 
 #undef DEBUG_PASS_RATE
 #define CHECK_CYLINDER_CPU
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int cpu_compute_cutoff_potential_lattice(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int
+cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                     float cutoff,     /* cutoff distance */
+                                     Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -39,8 +38,8 @@ extern int cpu_compute_cutoff_potential_lattice(
   const float inv_a2 = 1.f / a2;
   float s;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -62,7 +61,7 @@ extern int cpu_compute_cutoff_potential_lattice(
   int ncell, nxcell, nycell, nzcell;
   int *first, *next;
   float inv_cellen = INV_CELLEN;
-  Vec3 minext, maxext;		/* Extent of atom bounding box */
+  Vec3 minext, maxext; /* Extent of atom bounding box */
   float xmin, ymin, zmin;
   float xmax, ymax, zmax;
 
@@ -75,44 +74,45 @@ extern int cpu_compute_cutoff_potential_lattice(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(natoms * sizeof(int));
-  for (n = 0;  n < natoms;  n++) {
+  next = (int *)malloc(natoms * sizeof(int));
+  for (n = 0; n < natoms; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < natoms;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < natoms; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -123,26 +123,33 @@ extern int cpu_compute_cutoff_potential_lattice(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 #ifdef CHECK_CYLINDER_CPU
-          if (dydz2 >= a2) continue;
+          if (dydz2 >= a2)
+            continue;
 #endif
 
           dx = xstart;
@@ -150,27 +157,26 @@ extern int cpu_compute_cutoff_potential_lattice(
           pg = lattice->lattice + index;
 
 #if defined(__INTEL_COMPILER)
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
             s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s;
-            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
+            e = q * (1 / sqrtf(r2)) * s;
+            *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
           }
 #else
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
-            if (r2 >= a2)
-		{
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
+            if (r2 >= a2) {
 #ifdef DEBUG_PASS_RATE
-		  fail_count++;
+              fail_count++;
 #endif
-		  continue;
-		}
+              continue;
+            }
 #ifdef DEBUG_PASS_RATE
-	    pass_count++;
+            pass_count++;
 #endif
             s = (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s * s;
+            e = q * (1 / sqrtf(r2)) * s * s;
             *pg += e;
           }
 #endif
@@ -178,7 +184,7 @@ extern int cpu_compute_cutoff_potential_lattice(
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
@@ -186,8 +192,8 @@ extern int cpu_compute_cutoff_potential_lattice(
 
   /* For debugging: print the number of times that the test passed/failed */
 #ifdef DEBUG_PASS_RATE
-  printf ("Pass :%lld\n", pass_count);
-  printf ("Fail :%lld\n", fail_count);
+  printf("Pass :%lld\n", pass_count);
+  printf("Fail :%lld\n", fail_count);
 #endif
 
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutoff.h
index d5949745afaa90d234b949f75ac9e534931c748c..7c5d265a9b2e865f82a197642e1a1a4201cc0e78 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutoff.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/cutoff.h
@@ -17,54 +17,51 @@ extern "C" {
 
 #define SHIFTED
 
-  /* A structure to record how points in 3D space map to array
-     elements.  Array element (z, y, x)
-     where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
-     maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
-  */
-  typedef struct LatticeDim_t {
-    /* Number of lattice points in x, y, z dimensions */
-    int nx, ny, nz;
+/* A structure to record how points in 3D space map to array
+   elements.  Array element (z, y, x)
+   where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
+   maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
+*/
+typedef struct LatticeDim_t {
+  /* Number of lattice points in x, y, z dimensions */
+  int nx, ny, nz;
 
-    /* Lowest corner of lattice */
-    Vec3 lo;
+  /* Lowest corner of lattice */
+  Vec3 lo;
 
-    /* Lattice spacing */
-    float h;
-  } LatticeDim;
+  /* Lattice spacing */
+  float h;
+} LatticeDim;
 
-  /* An electric potential field sampled on a regular grid.  The
-     lattice size and grid point positions are specified by 'dim'.
-  */
-  typedef struct Lattice_t {
-    LatticeDim dim;
-    float *lattice;
-  } Lattice;
+/* An electric potential field sampled on a regular grid.  The
+   lattice size and grid point positions are specified by 'dim'.
+*/
+typedef struct Lattice_t {
+  LatticeDim dim;
+  float *lattice;
+} Lattice;
 
-  LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
 
-  Lattice *create_lattice(LatticeDim dim);
-  void destroy_lattice(Lattice *);
+Lattice *create_lattice(LatticeDim dim);
+void destroy_lattice(Lattice *);
 
-  int gpu_compute_cutoff_potential_lattice6overlap(
-      struct pb_TimerSet *timers,        /* for measuring execution time */
-      Lattice *lattice,
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms,                      /* array of atoms */
-      int verbose                        /* print info/debug messages */
-    );
+int gpu_compute_cutoff_potential_lattice6overlap(
+    struct pb_TimerSet *timers,     /* for measuring execution time */
+    Lattice *lattice, float cutoff, /* cutoff distance */
+    Atoms *atoms,                   /* array of atoms */
+    int verbose                     /* print info/debug messages */
+);
 
-  int cpu_compute_cutoff_potential_lattice(
-      Lattice *lattice,                  /* the lattice */
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms                       /* array of atoms */
-    );
+int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                         float cutoff,     /* cutoff distance */
+                                         Atoms *atoms      /* array of atoms */
+);
 
-  int remove_exclusions(
-      Lattice *lattice,                  /* the lattice */
-      float exclcutoff,                  /* exclusion cutoff distance */
-      Atoms *atom                        /* array of atoms */
-    );
+int remove_exclusions(Lattice *lattice, /* the lattice */
+                      float exclcutoff, /* exclusion cutoff distance */
+                      Atoms *atom       /* array of atoms */
+);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/excl.c
index 9598bda26b98a3f26bc36c9b616f11048c2e5860..26769b76d1dac3310e6f5066f3393133091d6477 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/excl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/excl.c
@@ -6,22 +6,20 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include "cutoff.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-#include "cutoff.h"
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int remove_exclusions(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* exclusion cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int remove_exclusions(Lattice *lattice, /* the lattice */
+                             float cutoff,     /* exclusion cutoff distance */
+                             Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -33,8 +31,8 @@ extern int remove_exclusions(
 
   const float a2 = cutoff * cutoff;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -62,44 +60,45 @@ extern int remove_exclusions(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(atoms->size * sizeof(int));
-  for (n = 0;  n < atoms->size;  n++) {
+  next = (int *)malloc(atoms->size * sizeof(int));
+  for (n = 0; n < atoms->size; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < atoms->size;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < atoms->size; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -110,42 +109,49 @@ extern int remove_exclusions(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
 
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 
           dx = xstart;
           index = jkoff + ia;
           pg = lattice->lattice + index;
 
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
 
-	    /* If atom and lattice point are too close, set the lattice value
-	     * to zero */
-            if (r2 < a2) *pg = 0;
+            /* If atom and lattice point are too close, set the lattice value
+             * to zero */
+            if (r2 < a2)
+              *pg = 0;
           }
         }
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/main.c
index 6cada17e4ede54d75d0f611259847ffb3cffb707..763ddcbb7316795e2de433a4ea0dd9be467fc831 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/main.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/main.c
@@ -6,27 +6,26 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "parboil.h"
 #include "atom.h"
 #include "cutoff.h"
 #include "output.h"
+#include "parboil.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #define ERRTOL 1e-4f
 
-#define NOKERNELS             0
-#define CUTOFF1               1
-#define CUTOFF6              32
-#define CUTOFF6OVERLAP       64
-#define CUTOFFCPU         16384
-
+#define NOKERNELS 0
+#define CUTOFF1 1
+#define CUTOFF6 32
+#define CUTOFF6OVERLAP 64
+#define CUTOFFCPU 16384
 
 int appenddata(const char *filename, int size, double time) {
   FILE *fp;
-  fp=fopen(filename, "a");
+  fp = fopen(filename, "a");
   if (fp == NULL) {
     printf("error appending to file %s..\n", filename);
     return -1;
@@ -36,23 +35,19 @@ int appenddata(const char *filename, int size, double time) {
   return 0;
 }
 
-LatticeDim
-lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
-{
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) {
   LatticeDim ret;
 
-  ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
-  ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
-  ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
+  ret.nx = (int)floorf((hi.x - lo.x) / h) + 1;
+  ret.ny = (int)floorf((hi.y - lo.y) / h) + 1;
+  ret.nz = (int)floorf((hi.z - lo.z) / h) + 1;
   ret.lo = lo;
   ret.h = h;
 
   return ret;
 }
 
-Lattice *
-create_lattice(LatticeDim dim)
-{
+Lattice *create_lattice(LatticeDim dim) {
   int size;
   Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
 
@@ -75,10 +70,7 @@ create_lattice(LatticeDim dim)
   return lat;
 }
 
-
-void
-destroy_lattice(Lattice *lat)
-{
+void destroy_lattice(Lattice *lat) {
   if (lat) {
     free(lat->lattice);
     free(lat);
@@ -90,13 +82,13 @@ int main(int argc, char *argv[]) {
 
   LatticeDim lattice_dim;
   Lattice *gpu_lattice;
-  Vec3 min_ext, max_ext;	/* Bounding box of atoms */
-  Vec3 lo, hi;			/* Bounding box with padding  */
+  Vec3 min_ext, max_ext; /* Bounding box of atoms */
+  Vec3 lo, hi;           /* Bounding box with padding  */
 
-  float h = 0.5f;		/* Lattice spacing */
-  float cutoff = 12.f;		/* Cutoff radius */
-  float exclcutoff = 1.f;	/* Radius for exclusion */
-  float padding = 0.5f;		/* Bounding box padding distance */
+  float h = 0.5f;         /* Lattice spacing */
+  float cutoff = 12.f;    /* Cutoff radius */
+  float exclcutoff = 1.f; /* Radius for exclusion */
+  float padding = 0.5f;   /* Bounding box padding distance */
 
   int n;
 
@@ -136,9 +128,10 @@ int main(int argc, char *argv[]) {
   printf("  maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
 
   printf("padding domain by %g Angstroms\n", padding);
-  lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
-  hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
-  printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
+  lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
+  hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
+  printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y,
+         hi.z - lo.z);
 
   lattice_dim = lattice_from_bounding_box(lo, hi, h);
   gpu_lattice = create_lattice(lattice_dim);
@@ -147,7 +140,8 @@ int main(int argc, char *argv[]) {
    *  CUDA kernel, with overlapped GPU/CPU computation
    *  (enter and exit with the 'compute' timer active)
    */
-  if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, atom, 0)) {
+  if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff,
+                                                   atom, 0)) {
     fprintf(stderr, "Computation failed\n");
     exit(1);
   }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.c
index 814e2d4d8b045d4ed02acb22760623ece3b248ff..e3559f3a35c0875b03f7e1327025c0a1da5c6698 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.c
@@ -6,16 +6,14 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <math.h>
 #include "atom.h"
 #include "cutoff.h"
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice)
-{
+void write_lattice_summary(const char *filename, Lattice *lattice) {
   float *lattice_data = lattice->lattice;
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
@@ -36,9 +34,9 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int i;
 
     for (i = 0; i < nx * ny * nz; i++)
-      abspotential += fabs((double) lattice_data[i]);
+      abspotential += fabs((double)lattice_data[i]);
 
-    tmp = (float) abspotential;
+    tmp = (float)abspotential;
 
     fwrite(&tmp, 1, sizeof(float), outfile);
   }
@@ -47,7 +45,7 @@ write_lattice_summary(const char *filename, Lattice *lattice)
   {
     uint32_t tmp;
 
-    tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
+    tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny);
     fwrite(&tmp, 1, sizeof(uint32_t), outfile);
   }
 
@@ -56,8 +54,8 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int plane_size = nx * ny;
 
     fwrite(lattice_data, plane_size, sizeof(float), outfile);
-    fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
-	   outfile);
+    fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float),
+           outfile);
   }
 
   /* Cleanup */
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.h
index 13022cd9e80843157cc78d7d2ff12afa85a0f826..f6c24bfc80bc63d0236d69577f832984c74a9eac 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/output.h
@@ -15,8 +15,7 @@
 extern "C" {
 #endif
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice);
+void write_lattice_summary(const char *filename, Lattice *lattice);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/readatom.c
index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda/readatom.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda/readatom.c
@@ -6,36 +6,33 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-
 
 #define LINELEN 96
 #define INITLEN 20
 
-
-Atoms *read_atom_file(const char *fname)
-{
+Atoms *read_atom_file(const char *fname) {
   FILE *file;
   char line[LINELEN];
 
-  Atom *atom;			/* Atom array */
-  int len = INITLEN;		/* Size of atom array */
-  int cnt = 0;			/* Number of atoms read */
+  Atom *atom;        /* Atom array */
+  int len = INITLEN; /* Size of atom array */
+  int cnt = 0;       /* Number of atoms read */
 
   /* open atom "pqr" file */
   file = fopen(fname, "r");
-  if (NULL==file) {
+  if (NULL == file) {
     fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
     return NULL;
   }
 
   /* allocate initial atom array */
-  atom = (Atom *) malloc(len * sizeof(Atom));
-  if (NULL==atom) {
+  atom = (Atom *)malloc(len * sizeof(Atom));
+  if (NULL == atom) {
     fprintf(stderr, "can't allocate memory\n");
     return NULL;
   }
@@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname)
   while (fgets(line, LINELEN, file) != NULL) {
 
     if (strncmp(line, "ATOM  ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
-      continue;  /* skip anything that isn't an atom record */
+      continue; /* skip anything that isn't an atom record */
     }
 
-    if (cnt==len) {  /* extend atom array */
-      void *tmp = realloc(atom, 2*len*sizeof(Atom));
-      if (NULL==tmp) {
+    if (cnt == len) { /* extend atom array */
+      void *tmp = realloc(atom, 2 * len * sizeof(Atom));
+      if (NULL == tmp) {
         fprintf(stderr, "can't allocate more memory\n");
         return NULL;
       }
-      atom = (Atom *) tmp;
+      atom = (Atom *)tmp;
       len *= 2;
     }
 
     /* read position coordinates and charge from atom record */
     if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
-          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
-      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
+               &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
+      fprintf(stderr, "atom record %d does not have expected format\n",
+              cnt + 1);
       return NULL;
     }
 
-    cnt++;  /* count atoms as we store them */
+    cnt++; /* count atoms as we store them */
   }
 
   /* verify EOF and close file */
-  if ( !feof(file) ) {
+  if (!feof(file)) {
     fprintf(stderr, "did not find EOF\n");
     return NULL;
   }
@@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname)
   }
 }
 
-
-void free_atom(Atoms *atom)
-{
+void free_atom(Atoms *atom) {
   if (atom) {
     free(atom->atoms);
     free(atom);
   }
 }
 
-void
-get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
-{
+void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) {
   Atom *atoms = atom->atoms;
   int natoms = atom->size;
   Vec3 lo;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/atom.h
index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/atom.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/atom.h
@@ -13,22 +13,22 @@
 extern "C" {
 #endif
 
-  typedef struct Atom_t {
-    float x, y, z, q;
-  } Atom;
-
-  typedef struct Atoms_t {
-    Atom *atoms;
-    int size;
-  } Atoms;
-
-  typedef struct Vec3_t {
-    float x, y, z;
-  } Vec3;
-
-  Atoms *read_atom_file(const char *fname);
-  void free_atom(Atoms *atom);
-  void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
+typedef struct Atom_t {
+  float x, y, z, q;
+} Atom;
+
+typedef struct Atoms_t {
+  Atom *atoms;
+  int size;
+} Atoms;
+
+typedef struct Vec3_t {
+  float x, y, z;
+} Vec3;
+
+Atoms *read_atom_file(const char *fname);
+void free_atom(Atoms *atom);
+void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutcpu.c
index f409c29e46276417918da1db9c1a785d1eaa39ae..ba029f4c8f3271ea45666e36b2a24de5ac9bbff5 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutcpu.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutcpu.c
@@ -6,26 +6,25 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include "cutoff.h"
+#include "parboil.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-#include "parboil.h"
-#include "cutoff.h"
 
 #undef DEBUG_PASS_RATE
 #define CHECK_CYLINDER_CPU
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int cpu_compute_cutoff_potential_lattice(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int
+cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                     float cutoff,     /* cutoff distance */
+                                     Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -40,8 +39,8 @@ extern int cpu_compute_cutoff_potential_lattice(
   const float inv_a2 = 1.f / a2;
   float s;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -63,7 +62,7 @@ extern int cpu_compute_cutoff_potential_lattice(
   int ncell, nxcell, nycell, nzcell;
   int *first, *next;
   float inv_cellen = INV_CELLEN;
-  Vec3 minext, maxext;		/* Extent of atom bounding box */
+  Vec3 minext, maxext; /* Extent of atom bounding box */
   float xmin, ymin, zmin;
   float xmax, ymax, zmax;
 
@@ -76,44 +75,45 @@ extern int cpu_compute_cutoff_potential_lattice(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(natoms * sizeof(int));
-  for (n = 0;  n < natoms;  n++) {
+  next = (int *)malloc(natoms * sizeof(int));
+  for (n = 0; n < natoms; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < natoms;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < natoms; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -124,26 +124,33 @@ extern int cpu_compute_cutoff_potential_lattice(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 #ifdef CHECK_CYLINDER_CPU
-          if (dydz2 >= a2) continue;
+          if (dydz2 >= a2)
+            continue;
 #endif
 
           dx = xstart;
@@ -151,27 +158,26 @@ extern int cpu_compute_cutoff_potential_lattice(
           pg = lattice->lattice + index;
 
 #if defined(__INTEL_COMPILER)
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
             s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s;
-            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
+            e = q * (1 / sqrtf(r2)) * s;
+            *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
           }
 #else
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
-            if (r2 >= a2)
-		{
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
+            if (r2 >= a2) {
 #ifdef DEBUG_PASS_RATE
-		  fail_count++;
+              fail_count++;
 #endif
-		  continue;
-		}
+              continue;
+            }
 #ifdef DEBUG_PASS_RATE
-	    pass_count++;
+            pass_count++;
 #endif
             s = (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s * s;
+            e = q * (1 / sqrtf(r2)) * s * s;
             *pg += e;
           }
 #endif
@@ -179,7 +185,7 @@ extern int cpu_compute_cutoff_potential_lattice(
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
@@ -187,8 +193,8 @@ extern int cpu_compute_cutoff_potential_lattice(
 
   /* For debugging: print the number of times that the test passed/failed */
 #ifdef DEBUG_PASS_RATE
-  printf ("Pass :%lld\n", pass_count);
-  printf ("Fail :%lld\n", fail_count);
+  printf("Pass :%lld\n", pass_count);
+  printf("Fail :%lld\n", fail_count);
 #endif
 
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutoff.h
index a4e8d1ae94a901c0e07ec15ef216bac6c544007e..e8f8978e93cb1f235774d246385810af1254bde2 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutoff.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/cutoff.h
@@ -17,54 +17,51 @@ extern "C" {
 
 #define SHIFTED
 
-  /* A structure to record how points in 3D space map to array
-     elements.  Array element (z, y, x)
-     where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
-     maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
-  */
-  typedef struct LatticeDim_t {
-    /* Number of lattice points in x, y, z dimensions */
-    int nx, ny, nz;
+/* A structure to record how points in 3D space map to array
+   elements.  Array element (z, y, x)
+   where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
+   maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
+*/
+typedef struct LatticeDim_t {
+  /* Number of lattice points in x, y, z dimensions */
+  int nx, ny, nz;
 
-    /* Lowest corner of lattice */
-    Vec3 lo;
+  /* Lowest corner of lattice */
+  Vec3 lo;
 
-    /* Lattice spacing */
-    float h;
-  } LatticeDim;
+  /* Lattice spacing */
+  float h;
+} LatticeDim;
 
-  /* An electric potential field sampled on a regular grid.  The
-     lattice size and grid point positions are specified by 'dim'.
-  */
-  typedef struct Lattice_t {
-    LatticeDim dim;
-    float *lattice;
-  } Lattice;
+/* An electric potential field sampled on a regular grid.  The
+   lattice size and grid point positions are specified by 'dim'.
+*/
+typedef struct Lattice_t {
+  LatticeDim dim;
+  float *lattice;
+} Lattice;
 
-  LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
 
-  Lattice *create_lattice(LatticeDim dim);
-  void destroy_lattice(Lattice *);
+Lattice *create_lattice(LatticeDim dim);
+void destroy_lattice(Lattice *);
 
-  int gpu_compute_cutoff_potential_lattice(
-      struct pb_TimerSet *timers,
-      Lattice *lattice,
-      float cutoff,                      /* cutoff distance */
-      Atoms *atom,                       /* array of atoms */
-      int verbose                        /* print info/debug messages */
-    );
+int gpu_compute_cutoff_potential_lattice(
+    struct pb_TimerSet *timers, Lattice *lattice,
+    float cutoff, /* cutoff distance */
+    Atoms *atom,  /* array of atoms */
+    int verbose   /* print info/debug messages */
+);
 
-  int cpu_compute_cutoff_potential_lattice(
-      Lattice *lattice,                  /* the lattice */
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms                       /* array of atoms */
-    );
+int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                         float cutoff,     /* cutoff distance */
+                                         Atoms *atoms      /* array of atoms */
+);
 
-  int remove_exclusions(
-      Lattice *lattice,                  /* the lattice */
-      float exclcutoff,                  /* exclusion cutoff distance */
-      Atoms *atom                        /* array of atoms */
-    );
+int remove_exclusions(Lattice *lattice, /* the lattice */
+                      float exclcutoff, /* exclusion cutoff distance */
+                      Atoms *atom       /* array of atoms */
+);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/excl.c
index 9598bda26b98a3f26bc36c9b616f11048c2e5860..26769b76d1dac3310e6f5066f3393133091d6477 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/excl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/excl.c
@@ -6,22 +6,20 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include "cutoff.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-#include "cutoff.h"
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int remove_exclusions(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* exclusion cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int remove_exclusions(Lattice *lattice, /* the lattice */
+                             float cutoff,     /* exclusion cutoff distance */
+                             Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -33,8 +31,8 @@ extern int remove_exclusions(
 
   const float a2 = cutoff * cutoff;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -62,44 +60,45 @@ extern int remove_exclusions(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(atoms->size * sizeof(int));
-  for (n = 0;  n < atoms->size;  n++) {
+  next = (int *)malloc(atoms->size * sizeof(int));
+  for (n = 0; n < atoms->size; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < atoms->size;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < atoms->size; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -110,42 +109,49 @@ extern int remove_exclusions(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
 
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 
           dx = xstart;
           index = jkoff + ia;
           pg = lattice->lattice + index;
 
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
 
-	    /* If atom and lattice point are too close, set the lattice value
-	     * to zero */
-            if (r2 < a2) *pg = 0;
+            /* If atom and lattice point are too close, set the lattice value
+             * to zero */
+            if (r2 < a2)
+              *pg = 0;
           }
         }
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/main.c
index 905b7226312a9b0736b1b6d4deb18118c92476cc..3819e18adfbc33f7943fab4784cfddd513eab60b 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/main.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/main.c
@@ -6,27 +6,26 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "parboil.h"
 #include "atom.h"
 #include "cutoff.h"
 #include "output.h"
+#include "parboil.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #define ERRTOL 1e-4f
 
-#define NOKERNELS             0
-#define CUTOFF1               1
-#define CUTOFF6              32
-#define CUTOFF6OVERLAP       64
-#define CUTOFFCPU         16384
-
+#define NOKERNELS 0
+#define CUTOFF1 1
+#define CUTOFF6 32
+#define CUTOFF6OVERLAP 64
+#define CUTOFFCPU 16384
 
 int appenddata(const char *filename, int size, double time) {
   FILE *fp;
-  fp=fopen(filename, "a");
+  fp = fopen(filename, "a");
   if (fp == NULL) {
     printf("error appending to file %s..\n", filename);
     return -1;
@@ -36,23 +35,19 @@ int appenddata(const char *filename, int size, double time) {
   return 0;
 }
 
-LatticeDim
-lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
-{
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) {
   LatticeDim ret;
 
-  ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
-  ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
-  ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
+  ret.nx = (int)floorf((hi.x - lo.x) / h) + 1;
+  ret.ny = (int)floorf((hi.y - lo.y) / h) + 1;
+  ret.nz = (int)floorf((hi.z - lo.z) / h) + 1;
   ret.lo = lo;
   ret.h = h;
 
   return ret;
 }
 
-Lattice *
-create_lattice(LatticeDim dim)
-{
+Lattice *create_lattice(LatticeDim dim) {
   int size;
   Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
 
@@ -75,10 +70,7 @@ create_lattice(LatticeDim dim)
   return lat;
 }
 
-
-void
-destroy_lattice(Lattice *lat)
-{
+void destroy_lattice(Lattice *lat) {
   if (lat) {
     free(lat->lattice);
     free(lat);
@@ -90,13 +82,13 @@ int main(int argc, char *argv[]) {
 
   LatticeDim lattice_dim;
   Lattice *gpu_lattice;
-  Vec3 min_ext, max_ext;	/* Bounding box of atoms */
-  Vec3 lo, hi;			/* Bounding box with padding  */
+  Vec3 min_ext, max_ext; /* Bounding box of atoms */
+  Vec3 lo, hi;           /* Bounding box with padding  */
 
-  float h = 0.5f;		/* Lattice spacing */
-  float cutoff = 12.f;		/* Cutoff radius */
-  float exclcutoff = 1.f;	/* Radius for exclusion */
-  float padding = 0.5f;		/* Bounding box padding distance */
+  float h = 0.5f;         /* Lattice spacing */
+  float cutoff = 12.f;    /* Cutoff radius */
+  float exclcutoff = 1.f; /* Radius for exclusion */
+  float padding = 0.5f;   /* Bounding box padding distance */
 
   int n;
 
@@ -136,9 +128,10 @@ int main(int argc, char *argv[]) {
   printf("  maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
 
   printf("padding domain by %g Angstroms\n", padding);
-  lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
-  hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
-  printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
+  lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
+  hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
+  printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y,
+         hi.z - lo.z);
 
   lattice_dim = lattice_from_bounding_box(lo, hi, h);
   gpu_lattice = create_lattice(lattice_dim);
@@ -148,7 +141,8 @@ int main(int argc, char *argv[]) {
    *  Run CUDA kernel
    *  (enter and exit with the 'compute' timer active)
    */
-  if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0)) {
+  if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom,
+                                           0)) {
     fprintf(stderr, "Computation failed\n");
     exit(1);
   }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.c
index 814e2d4d8b045d4ed02acb22760623ece3b248ff..e3559f3a35c0875b03f7e1327025c0a1da5c6698 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.c
@@ -6,16 +6,14 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <math.h>
 #include "atom.h"
 #include "cutoff.h"
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice)
-{
+void write_lattice_summary(const char *filename, Lattice *lattice) {
   float *lattice_data = lattice->lattice;
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
@@ -36,9 +34,9 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int i;
 
     for (i = 0; i < nx * ny * nz; i++)
-      abspotential += fabs((double) lattice_data[i]);
+      abspotential += fabs((double)lattice_data[i]);
 
-    tmp = (float) abspotential;
+    tmp = (float)abspotential;
 
     fwrite(&tmp, 1, sizeof(float), outfile);
   }
@@ -47,7 +45,7 @@ write_lattice_summary(const char *filename, Lattice *lattice)
   {
     uint32_t tmp;
 
-    tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
+    tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny);
     fwrite(&tmp, 1, sizeof(uint32_t), outfile);
   }
 
@@ -56,8 +54,8 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int plane_size = nx * ny;
 
     fwrite(lattice_data, plane_size, sizeof(float), outfile);
-    fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
-	   outfile);
+    fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float),
+           outfile);
   }
 
   /* Cleanup */
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.h
index 13022cd9e80843157cc78d7d2ff12afa85a0f826..f6c24bfc80bc63d0236d69577f832984c74a9eac 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/output.h
@@ -15,8 +15,7 @@
 extern "C" {
 #endif
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice);
+void write_lattice_summary(const char *filename, Lattice *lattice);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/readatom.c
index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/readatom.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/cuda_base/readatom.c
@@ -6,36 +6,33 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-
 
 #define LINELEN 96
 #define INITLEN 20
 
-
-Atoms *read_atom_file(const char *fname)
-{
+Atoms *read_atom_file(const char *fname) {
   FILE *file;
   char line[LINELEN];
 
-  Atom *atom;			/* Atom array */
-  int len = INITLEN;		/* Size of atom array */
-  int cnt = 0;			/* Number of atoms read */
+  Atom *atom;        /* Atom array */
+  int len = INITLEN; /* Size of atom array */
+  int cnt = 0;       /* Number of atoms read */
 
   /* open atom "pqr" file */
   file = fopen(fname, "r");
-  if (NULL==file) {
+  if (NULL == file) {
     fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
     return NULL;
   }
 
   /* allocate initial atom array */
-  atom = (Atom *) malloc(len * sizeof(Atom));
-  if (NULL==atom) {
+  atom = (Atom *)malloc(len * sizeof(Atom));
+  if (NULL == atom) {
     fprintf(stderr, "can't allocate memory\n");
     return NULL;
   }
@@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname)
   while (fgets(line, LINELEN, file) != NULL) {
 
     if (strncmp(line, "ATOM  ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
-      continue;  /* skip anything that isn't an atom record */
+      continue; /* skip anything that isn't an atom record */
     }
 
-    if (cnt==len) {  /* extend atom array */
-      void *tmp = realloc(atom, 2*len*sizeof(Atom));
-      if (NULL==tmp) {
+    if (cnt == len) { /* extend atom array */
+      void *tmp = realloc(atom, 2 * len * sizeof(Atom));
+      if (NULL == tmp) {
         fprintf(stderr, "can't allocate more memory\n");
         return NULL;
       }
-      atom = (Atom *) tmp;
+      atom = (Atom *)tmp;
       len *= 2;
     }
 
     /* read position coordinates and charge from atom record */
     if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
-          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
-      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
+               &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
+      fprintf(stderr, "atom record %d does not have expected format\n",
+              cnt + 1);
       return NULL;
     }
 
-    cnt++;  /* count atoms as we store them */
+    cnt++; /* count atoms as we store them */
   }
 
   /* verify EOF and close file */
-  if ( !feof(file) ) {
+  if (!feof(file)) {
     fprintf(stderr, "did not find EOF\n");
     return NULL;
   }
@@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname)
   }
 }
 
-
-void free_atom(Atoms *atom)
-{
+void free_atom(Atoms *atom) {
   if (atom) {
     free(atom->atoms);
     free(atom);
   }
 }
 
-void
-get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
-{
+void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) {
   Atom *atoms = atom->atoms;
   int natoms = atom->size;
   Vec3 lo;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/atom.h
index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/atom.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/atom.h
@@ -13,22 +13,22 @@
 extern "C" {
 #endif
 
-  typedef struct Atom_t {
-    float x, y, z, q;
-  } Atom;
-
-  typedef struct Atoms_t {
-    Atom *atoms;
-    int size;
-  } Atoms;
-
-  typedef struct Vec3_t {
-    float x, y, z;
-  } Vec3;
-
-  Atoms *read_atom_file(const char *fname);
-  void free_atom(Atoms *atom);
-  void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
+typedef struct Atom_t {
+  float x, y, z, q;
+} Atom;
+
+typedef struct Atoms_t {
+  Atom *atoms;
+  int size;
+} Atoms;
+
+typedef struct Vec3_t {
+  float x, y, z;
+} Vec3;
+
+Atoms *read_atom_file(const char *fname);
+void free_atom(Atoms *atom);
+void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutcpu.c
index 372903e6b00d7600d71d0596be3f1287fd8e927f..5ad77220e3656676845975992adba245153510d7 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutcpu.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutcpu.c
@@ -6,25 +6,24 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include "cutoff.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-#include "cutoff.h"
 
 #undef DEBUG_PASS_RATE
 #define CHECK_CYLINDER_CPU
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int cpu_compute_cutoff_potential_lattice(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int
+cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                     float cutoff,     /* cutoff distance */
+                                     Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -39,8 +38,8 @@ extern int cpu_compute_cutoff_potential_lattice(
   const float inv_a2 = 1.f / a2;
   float s;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -62,7 +61,7 @@ extern int cpu_compute_cutoff_potential_lattice(
   int ncell, nxcell, nycell, nzcell;
   int *first, *next;
   float inv_cellen = INV_CELLEN;
-  Vec3 minext, maxext;		/* Extent of atom bounding box */
+  Vec3 minext, maxext; /* Extent of atom bounding box */
   float xmin, ymin, zmin;
   float xmax, ymax, zmax;
 
@@ -75,49 +74,49 @@ extern int cpu_compute_cutoff_potential_lattice(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(natoms * sizeof(int));
-  for (n = 0;  n < natoms;  n++) {
+  next = (int *)malloc(natoms * sizeof(int));
+  for (n = 0; n < natoms; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < natoms;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < natoms; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
-#pragma omp parallel for private (n, q, x, y, z, ic, jc, kc, ia, ib, ja, jb, ka, kb, \
-				  xstart, ystart, dz, k, koff, dz2, j, dy, jkoff,    \
-				  dydz2, dx, index, pg, i, r2, s, e		     \
-				 )
+#pragma omp parallel for private(n, q, x, y, z, ic, jc, kc, ia, ib, ja, jb,    \
+                                 ka, kb, xstart, ystart, dz, k, koff, dz2, j,  \
+                                 dy, jkoff, dydz2, dx, index, pg, i, r2, s, e)
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -128,26 +127,33 @@ extern int cpu_compute_cutoff_potential_lattice(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 #ifdef CHECK_CYLINDER_CPU
-          if (dydz2 >= a2) continue;
+          if (dydz2 >= a2)
+            continue;
 #endif
 
           dx = xstart;
@@ -155,27 +161,26 @@ extern int cpu_compute_cutoff_potential_lattice(
           pg = lattice->lattice + index;
 
 #if defined(__INTEL_COMPILER)
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
             s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s;
-            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
+            e = q * (1 / sqrtf(r2)) * s;
+            *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
           }
 #else
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
-            if (r2 >= a2)
-		{
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
+            if (r2 >= a2) {
 #ifdef DEBUG_PASS_RATE
-		  fail_count++;
+              fail_count++;
 #endif
-		  continue;
-		}
+              continue;
+            }
 #ifdef DEBUG_PASS_RATE
-	    pass_count++;
+            pass_count++;
 #endif
             s = (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s * s;
+            e = q * (1 / sqrtf(r2)) * s * s;
 
 #pragma omp atomic
             *pg += e;
@@ -185,7 +190,7 @@ extern int cpu_compute_cutoff_potential_lattice(
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
@@ -193,8 +198,8 @@ extern int cpu_compute_cutoff_potential_lattice(
 
   /* For debugging: print the number of times that the test passed/failed */
 #ifdef DEBUG_PASS_RATE
-  printf ("Pass :%lld\n", pass_count);
-  printf ("Fail :%lld\n", fail_count);
+  printf("Pass :%lld\n", pass_count);
+  printf("Fail :%lld\n", fail_count);
 #endif
 
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutoff.h
index 477e5649b6ff4f58690fb80a017f8bcec86d135c..0f8b0ff96aaab0c84bfca49c112b717d568815b9 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutoff.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/cutoff.h
@@ -15,46 +15,44 @@ extern "C" {
 
 #define SHIFTED
 
-  /* A structure to record how points in 3D space map to array
-     elements.  Array element (z, y, x)
-     where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
-     maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
-  */
-  typedef struct LatticeDim_t {
-    /* Number of lattice points in x, y, z dimensions */
-    int nx, ny, nz;
-
-    /* Lowest corner of lattice */
-    Vec3 lo;
-
-    /* Lattice spacing */
-    float h;
-  } LatticeDim;
-
-  /* An electric potential field sampled on a regular grid.  The
-     lattice size and grid point positions are specified by 'dim'.
-  */
-  typedef struct Lattice_t {
-    LatticeDim dim;
-    float *lattice;
-  } Lattice;
-
-  LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
-
-  Lattice *create_lattice(LatticeDim dim);
-  void destroy_lattice(Lattice *);
-
-  int cpu_compute_cutoff_potential_lattice(
-      Lattice *lattice,                  /* the lattice */
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms                       /* array of atoms */
-    );
-
-  int remove_exclusions(
-      Lattice *lattice,                  /* the lattice */
-      float exclcutoff,                  /* exclusion cutoff distance */
-      Atoms *atom                        /* array of atoms */
-    );
+/* A structure to record how points in 3D space map to array
+   elements.  Array element (z, y, x)
+   where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
+   maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
+*/
+typedef struct LatticeDim_t {
+  /* Number of lattice points in x, y, z dimensions */
+  int nx, ny, nz;
+
+  /* Lowest corner of lattice */
+  Vec3 lo;
+
+  /* Lattice spacing */
+  float h;
+} LatticeDim;
+
+/* An electric potential field sampled on a regular grid.  The
+   lattice size and grid point positions are specified by 'dim'.
+*/
+typedef struct Lattice_t {
+  LatticeDim dim;
+  float *lattice;
+} Lattice;
+
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
+
+Lattice *create_lattice(LatticeDim dim);
+void destroy_lattice(Lattice *);
+
+int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                         float cutoff,     /* cutoff distance */
+                                         Atoms *atoms      /* array of atoms */
+);
+
+int remove_exclusions(Lattice *lattice, /* the lattice */
+                      float exclcutoff, /* exclusion cutoff distance */
+                      Atoms *atom       /* array of atoms */
+);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/excl.c
index e157941114a0a51e8c60080d726d02d8e62d9fd4..ac36cc63fe87bb39485e989e3fe8e784699a0eb6 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/excl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/excl.c
@@ -6,22 +6,20 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include "cutoff.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-#include "cutoff.h"
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int remove_exclusions(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* exclusion cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int remove_exclusions(Lattice *lattice, /* the lattice */
+                             float cutoff,     /* exclusion cutoff distance */
+                             Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -33,8 +31,8 @@ extern int remove_exclusions(
 
   const float a2 = cutoff * cutoff;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -62,48 +60,49 @@ extern int remove_exclusions(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(atoms->size * sizeof(int));
-  for (n = 0;  n < atoms->size;  n++) {
+  next = (int *)malloc(atoms->size * sizeof(int));
+  for (n = 0; n < atoms->size; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < atoms->size;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < atoms->size; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
-#pragma omp parallel for private(n, x, y, z, q, ic, jc, kc, ia, ib, ja, jb, \
-				 ka, kb, xstart, ystart, dz, k, koff, dz2,  \
-				 dy, j, jkoff, dydz2, dx, index, pg, i, r2)
+#pragma omp parallel for private(n, x, y, z, q, ic, jc, kc, ia, ib, ja, jb,    \
+                                 ka, kb, xstart, ystart, dz, k, koff, dz2, dy, \
+                                 j, jkoff, dydz2, dx, index, pg, i, r2)
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -114,45 +113,52 @@ extern int remove_exclusions(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
 
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 
           dx = xstart;
           index = jkoff + ia;
           pg = lattice->lattice + index;
 
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
 
-	    /* If atom and lattice point are too close, set the lattice value
-	     * to zero */
+            /* If atom and lattice point are too close, set the lattice value
+             * to zero */
 
-//All threads are writing the same value
-//No need for an atomic update
-            if (r2 < a2) *pg = 0;
+            // All threads are writing the same value
+            // No need for an atomic update
+            if (r2 < a2)
+              *pg = 0;
           }
         }
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/main.c
index 9b8ef2014dc7deab1e9238be8e9d9ea1d0cf4a38..d361c16a34a6821dff328235a3c8fd59283734bd 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/main.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/main.c
@@ -6,27 +6,26 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "parboil.h"
 #include "atom.h"
 #include "cutoff.h"
 #include "output.h"
+#include "parboil.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #define ERRTOL 1e-4f
 
-#define NOKERNELS             0
-#define CUTOFF1               1
-#define CUTOFF6              32
-#define CUTOFF6OVERLAP       64
-#define CUTOFFCPU         16384
-
+#define NOKERNELS 0
+#define CUTOFF1 1
+#define CUTOFF6 32
+#define CUTOFF6OVERLAP 64
+#define CUTOFFCPU 16384
 
 int appenddata(const char *filename, int size, double time) {
   FILE *fp;
-  fp=fopen(filename, "a");
+  fp = fopen(filename, "a");
   if (fp == NULL) {
     printf("error appending to file %s..\n", filename);
     return -1;
@@ -36,23 +35,19 @@ int appenddata(const char *filename, int size, double time) {
   return 0;
 }
 
-LatticeDim
-lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
-{
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) {
   LatticeDim ret;
 
-  ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
-  ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
-  ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
+  ret.nx = (int)floorf((hi.x - lo.x) / h) + 1;
+  ret.ny = (int)floorf((hi.y - lo.y) / h) + 1;
+  ret.nz = (int)floorf((hi.z - lo.z) / h) + 1;
   ret.lo = lo;
   ret.h = h;
 
   return ret;
 }
 
-Lattice *
-create_lattice(LatticeDim dim)
-{
+Lattice *create_lattice(LatticeDim dim) {
   int size;
   Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
 
@@ -75,10 +70,7 @@ create_lattice(LatticeDim dim)
   return lat;
 }
 
-
-void
-destroy_lattice(Lattice *lat)
-{
+void destroy_lattice(Lattice *lat) {
   if (lat) {
     free(lat->lattice);
     free(lat);
@@ -90,13 +82,13 @@ int main(int argc, char *argv[]) {
 
   LatticeDim lattice_dim;
   Lattice *cpu_lattice;
-  Vec3 min_ext, max_ext;	/* Bounding box of atoms */
-  Vec3 lo, hi;			/* Bounding box with padding  */
+  Vec3 min_ext, max_ext; /* Bounding box of atoms */
+  Vec3 lo, hi;           /* Bounding box with padding  */
 
-  float h = 0.5f;		/* Lattice spacing */
-  float cutoff = 12.f;		/* Cutoff radius */
-  float exclcutoff = 1.f;	/* Radius for exclusion */
-  float padding = 0.5f;		/* Bounding box padding distance */
+  float h = 0.5f;         /* Lattice spacing */
+  float cutoff = 12.f;    /* Cutoff radius */
+  float exclcutoff = 1.f; /* Radius for exclusion */
+  float padding = 0.5f;   /* Bounding box padding distance */
 
   int n;
 
@@ -136,9 +128,10 @@ int main(int argc, char *argv[]) {
   printf("  maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
 
   printf("padding domain by %g Angstroms\n", padding);
-  lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
-  hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
-  printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
+  lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
+  hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
+  printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y,
+         hi.z - lo.z);
 
   lattice_dim = lattice_from_bounding_box(lo, hi, h);
   cpu_lattice = create_lattice(lattice_dim);
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.c
index 814e2d4d8b045d4ed02acb22760623ece3b248ff..e3559f3a35c0875b03f7e1327025c0a1da5c6698 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.c
@@ -6,16 +6,14 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <math.h>
 #include "atom.h"
 #include "cutoff.h"
+#include <inttypes.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice)
-{
+void write_lattice_summary(const char *filename, Lattice *lattice) {
   float *lattice_data = lattice->lattice;
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
@@ -36,9 +34,9 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int i;
 
     for (i = 0; i < nx * ny * nz; i++)
-      abspotential += fabs((double) lattice_data[i]);
+      abspotential += fabs((double)lattice_data[i]);
 
-    tmp = (float) abspotential;
+    tmp = (float)abspotential;
 
     fwrite(&tmp, 1, sizeof(float), outfile);
   }
@@ -47,7 +45,7 @@ write_lattice_summary(const char *filename, Lattice *lattice)
   {
     uint32_t tmp;
 
-    tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
+    tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny);
     fwrite(&tmp, 1, sizeof(uint32_t), outfile);
   }
 
@@ -56,8 +54,8 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int plane_size = nx * ny;
 
     fwrite(lattice_data, plane_size, sizeof(float), outfile);
-    fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
-	   outfile);
+    fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float),
+           outfile);
   }
 
   /* Cleanup */
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.h
index 13022cd9e80843157cc78d7d2ff12afa85a0f826..f6c24bfc80bc63d0236d69577f832984c74a9eac 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/output.h
@@ -15,8 +15,7 @@
 extern "C" {
 #endif
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice);
+void write_lattice_summary(const char *filename, Lattice *lattice);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/readatom.c
index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/readatom.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/omp_base/readatom.c
@@ -6,36 +6,33 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-
 
 #define LINELEN 96
 #define INITLEN 20
 
-
-Atoms *read_atom_file(const char *fname)
-{
+Atoms *read_atom_file(const char *fname) {
   FILE *file;
   char line[LINELEN];
 
-  Atom *atom;			/* Atom array */
-  int len = INITLEN;		/* Size of atom array */
-  int cnt = 0;			/* Number of atoms read */
+  Atom *atom;        /* Atom array */
+  int len = INITLEN; /* Size of atom array */
+  int cnt = 0;       /* Number of atoms read */
 
   /* open atom "pqr" file */
   file = fopen(fname, "r");
-  if (NULL==file) {
+  if (NULL == file) {
     fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
     return NULL;
   }
 
   /* allocate initial atom array */
-  atom = (Atom *) malloc(len * sizeof(Atom));
-  if (NULL==atom) {
+  atom = (Atom *)malloc(len * sizeof(Atom));
+  if (NULL == atom) {
     fprintf(stderr, "can't allocate memory\n");
     return NULL;
   }
@@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname)
   while (fgets(line, LINELEN, file) != NULL) {
 
     if (strncmp(line, "ATOM  ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
-      continue;  /* skip anything that isn't an atom record */
+      continue; /* skip anything that isn't an atom record */
     }
 
-    if (cnt==len) {  /* extend atom array */
-      void *tmp = realloc(atom, 2*len*sizeof(Atom));
-      if (NULL==tmp) {
+    if (cnt == len) { /* extend atom array */
+      void *tmp = realloc(atom, 2 * len * sizeof(Atom));
+      if (NULL == tmp) {
         fprintf(stderr, "can't allocate more memory\n");
         return NULL;
       }
-      atom = (Atom *) tmp;
+      atom = (Atom *)tmp;
       len *= 2;
     }
 
     /* read position coordinates and charge from atom record */
     if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
-          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
-      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
+               &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
+      fprintf(stderr, "atom record %d does not have expected format\n",
+              cnt + 1);
       return NULL;
     }
 
-    cnt++;  /* count atoms as we store them */
+    cnt++; /* count atoms as we store them */
   }
 
   /* verify EOF and close file */
-  if ( !feof(file) ) {
+  if (!feof(file)) {
     fprintf(stderr, "did not find EOF\n");
     return NULL;
   }
@@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname)
   }
 }
 
-
-void free_atom(Atoms *atom)
-{
+void free_atom(Atoms *atom) {
   if (atom) {
     free(atom->atoms);
     free(atom);
   }
 }
 
-void
-get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
-{
+void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) {
   Atom *atoms = atom->atoms;
   int natoms = atom->size;
   Vec3 lo;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/atom.h
index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/atom.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/atom.h
@@ -13,22 +13,22 @@
 extern "C" {
 #endif
 
-  typedef struct Atom_t {
-    float x, y, z, q;
-  } Atom;
-
-  typedef struct Atoms_t {
-    Atom *atoms;
-    int size;
-  } Atoms;
-
-  typedef struct Vec3_t {
-    float x, y, z;
-  } Vec3;
-
-  Atoms *read_atom_file(const char *fname);
-  void free_atom(Atoms *atom);
-  void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
+typedef struct Atom_t {
+  float x, y, z, q;
+} Atom;
+
+typedef struct Atoms_t {
+  Atom *atoms;
+  int size;
+} Atoms;
+
+typedef struct Vec3_t {
+  float x, y, z;
+} Vec3;
+
+Atoms *read_atom_file(const char *fname);
+void free_atom(Atoms *atom);
+void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutcpu.c
index 5f440752a5951de65f5e0e51bba214fea37157e8..faca2a682a351f894b3cebcd9ccd8c176f6250b1 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutcpu.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutcpu.c
@@ -6,26 +6,25 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include "cutoff.h"
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
-#include "atom.h"
-#include "cutoff.h"
 
 #undef DEBUG_PASS_RATE
 #define CHECK_CYLINDER_CPU
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int cpu_compute_cutoff_potential_lattice(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int
+cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                     float cutoff,     /* cutoff distance */
+                                     Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -40,8 +39,8 @@ extern int cpu_compute_cutoff_potential_lattice(
   const float inv_a2 = 1.f / a2;
   float s;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -63,7 +62,7 @@ extern int cpu_compute_cutoff_potential_lattice(
   int ncell, nxcell, nycell, nzcell;
   int *first, *next;
   float inv_cellen = INV_CELLEN;
-  Vec3 minext, maxext;		/* Extent of atom bounding box */
+  Vec3 minext, maxext; /* Extent of atom bounding box */
   float xmin, ymin, zmin;
   float xmax, ymax, zmax;
 
@@ -76,44 +75,45 @@ extern int cpu_compute_cutoff_potential_lattice(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(natoms * sizeof(int));
-  for (n = 0;  n < natoms;  n++) {
+  next = (int *)malloc(natoms * sizeof(int));
+  for (n = 0; n < natoms; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < natoms;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < natoms; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -124,26 +124,33 @@ extern int cpu_compute_cutoff_potential_lattice(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 #ifdef CHECK_CYLINDER_CPU
-          if (dydz2 >= a2) continue;
+          if (dydz2 >= a2)
+            continue;
 #endif
 
           dx = xstart;
@@ -151,27 +158,26 @@ extern int cpu_compute_cutoff_potential_lattice(
           pg = lattice->lattice + index;
 
 #if defined(__INTEL_COMPILER)
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
             s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s;
-            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
+            e = q * (1 / sqrtf(r2)) * s;
+            *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
           }
 #else
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
-            if (r2 >= a2)
-		{
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
+            if (r2 >= a2) {
 #ifdef DEBUG_PASS_RATE
-		  fail_count++;
+              fail_count++;
 #endif
-		  continue;
-		}
+              continue;
+            }
 #ifdef DEBUG_PASS_RATE
-	    pass_count++;
+            pass_count++;
 #endif
             s = (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s * s;
+            e = q * (1 / sqrtf(r2)) * s * s;
             *pg += e;
           }
 #endif
@@ -179,7 +185,7 @@ extern int cpu_compute_cutoff_potential_lattice(
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
@@ -187,8 +193,8 @@ extern int cpu_compute_cutoff_potential_lattice(
 
   /* For debugging: print the number of times that the test passed/failed */
 #ifdef DEBUG_PASS_RATE
-  printf ("Pass :%lld\n", pass_count);
-  printf ("Fail :%lld\n", fail_count);
+  printf("Pass :%lld\n", pass_count);
+  printf("Fail :%lld\n", fail_count);
 #endif
 
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.c
index f0567d19c76f19f1c3632b8784d50ef6f077b7cd..dcd0a629cb9cd765683415895376144816765e64 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.c
@@ -8,11 +8,11 @@
 
 #include <CL/cl.h>
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
@@ -23,15 +23,13 @@
 // we use int4 instead.  Only the 'x', 'y', and 'z' fields of xyz are used.
 typedef cl_int4 xyz;
 
-//extern "C" int gpu_compute_cutoff_potential_lattice(
+// extern "C" int gpu_compute_cutoff_potential_lattice(
 int gpu_compute_cutoff_potential_lattice(
-    struct pb_TimerSet *timers,
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms,                      /* array of atoms */
-    int verbose                        /* print info/debug messages */
-    )
-{
+    struct pb_TimerSet *timers, Lattice *lattice, /* the lattice */
+    float cutoff,                                 /* cutoff distance */
+    Atoms *atoms,                                 /* array of atoms */
+    int verbose /* print info/debug messages */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -45,8 +43,8 @@ int gpu_compute_cutoff_potential_lattice(
   xyz nbrlist[NBRLIST_MAXLEN];
   int nbrlistlen = 0;
 
-  int binHistoFull[BIN_DEPTH+1] = { 0 };   /* clear every array element */
-  int binHistoCover[BIN_DEPTH+1] = { 0 };  /* clear every array element */
+  int binHistoFull[BIN_DEPTH + 1] = {0};  /* clear every array element */
+  int binHistoCover[BIN_DEPTH + 1] = {0}; /* clear every array element */
   int num_excluded = 0;
 
   int xRegionDim, yRegionDim, zRegionDim;
@@ -80,9 +78,9 @@ int gpu_compute_cutoff_potential_lattice(
   // The "compute" timer should be active upon entry to this function
 
   /* pad lattice to be factor of 8 in each dimension */
-  xRegionDim = (int) ceilf(nx/8.f);
-  yRegionDim = (int) ceilf(ny/8.f);
-  zRegionDim = (int) ceilf(nz/8.f);
+  xRegionDim = (int)ceilf(nx / 8.f);
+  yRegionDim = (int)ceilf(ny / 8.f);
+  zRegionDim = (int)ceilf(nz / 8.f);
 
   lnx = 8 * xRegionDim;
   lny = 8 * yRegionDim;
@@ -90,35 +88,36 @@ int gpu_compute_cutoff_potential_lattice(
   lnall = lnx * lny * lnz;
 
   /* will receive energies from OpenCL */
-  regionZeroAddr = (float *) malloc(lnall * sizeof(float));
+  regionZeroAddr = (float *)malloc(lnall * sizeof(float));
 
   /* create bins */
-  c = (int) ceil(cutoff * BIN_INVLEN);  /* count extra bins around lattice */
-  binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
-  binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
-  binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
+  c = (int)ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */
+  binDim.x = (int)ceil(lnx * h * BIN_INVLEN) + 2 * c;
+  binDim.y = (int)ceil(lny * h * BIN_INVLEN) + 2 * c;
+  binDim.z = (int)ceil(lnz * h * BIN_INVLEN) + 2 * c;
   nbins = binDim.x * binDim.y * binDim.z;
-  binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
+  binBaseAddr = (cl_float4 *)calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
   binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
 
-  bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
+  bincntBaseAddr = (int *)calloc(nbins, sizeof(int));
   bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
 
   /* create neighbor list */
-  if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
+  if (ceilf(BIN_LENGTH / (8 * h)) == floorf(BIN_LENGTH / (8 * h))) {
     float s = sqrtf(3);
-    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
+    float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH);
     int cnt = 0;
     /* develop neighbor list around 1 cell */
-    if (2*c + 1 > NBRLIST_DIM) {
+    if (2 * c + 1 > NBRLIST_DIM) {
       fprintf(stderr, "must have cutoff <= %f\n",
-          (NBRLIST_DIM-1)/2 * BIN_LENGTH);
+              (NBRLIST_DIM - 1) / 2 * BIN_LENGTH);
       return -1;
     }
-    for (k = -c;  k <= c;  k++) {
-      for (j = -c;  j <= c;  j++) {
-        for (i = -c;  i <= c;  i++) {
-          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
+    for (k = -c; k <= c; k++) {
+      for (j = -c; j <= c; j++) {
+        for (i = -c; i <= c; i++) {
+          if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2)
+            continue;
           nbrlist[cnt].x = i;
           nbrlist[cnt].y = j;
           nbrlist[cnt].z = k;
@@ -127,21 +126,21 @@ int gpu_compute_cutoff_potential_lattice(
       }
     }
     nbrlistlen = cnt;
-  }
-  else if (8*h <= 2*BIN_LENGTH) {
-    float s = 2.f*sqrtf(3);
-    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
+  } else if (8 * h <= 2 * BIN_LENGTH) {
+    float s = 2.f * sqrtf(3);
+    float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH);
     int cnt = 0;
     /* develop neighbor list around 3-cube of cells */
-    if (2*c + 3 > NBRLIST_DIM) {
+    if (2 * c + 3 > NBRLIST_DIM) {
       fprintf(stderr, "must have cutoff <= %f\n",
-          (NBRLIST_DIM-3)/2 * BIN_LENGTH);
+              (NBRLIST_DIM - 3) / 2 * BIN_LENGTH);
       return -1;
     }
-    for (k = -c;  k <= c;  k++) {
-      for (j = -c;  j <= c;  j++) {
-        for (i = -c;  i <= c;  i++) {
-          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
+    for (k = -c; k <= c; k++) {
+      for (j = -c; j <= c; j++) {
+        for (i = -c; i <= c; i++) {
+          if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2)
+            continue;
           nbrlist[cnt].x = i;
           nbrlist[cnt].y = j;
           nbrlist[cnt].z = k;
@@ -150,8 +149,7 @@ int gpu_compute_cutoff_potential_lattice(
       }
     }
     nbrlistlen = cnt;
-  }
-  else {
+  } else {
     fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
     return -1;
   }
@@ -159,43 +157,39 @@ int gpu_compute_cutoff_potential_lattice(
   /* perform geometric hashing of atoms into bins */
   {
     /* array of extra atoms, permit average of one extra per bin */
-    Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
+    Atom *extra_atoms = (Atom *)calloc(nbins, sizeof(Atom));
     int extra_len = 0;
-    
-    for (n = 0;  n < natoms;  n++) {
+
+    for (n = 0; n < natoms; n++) {
       cl_float4 p;
       p.x = atom[n].x - xlo;
       p.y = atom[n].y - ylo;
       p.z = atom[n].z - zlo;
       p.w = atom[n].q;
-      i = (int) floorf(p.x * BIN_INVLEN);
-      j = (int) floorf(p.y * BIN_INVLEN);
-      k = (int) floorf(p.z * BIN_INVLEN);
-      if (i >= -c && i < binDim.x - c &&
-	  j >= -c && j < binDim.y - c &&
-	  k >= -c && k < binDim.z - c &&
-	  atom[n].q != 0) {
-	int index = (k * binDim.y + j) * binDim.x + i;
-	cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
-	int bindex = bincntZeroAddr[index];
-	if (bindex < BIN_DEPTH) {
-	  /* copy atom into bin and increase counter for this bin */
-	  bin[bindex] = p;
-	  bincntZeroAddr[index]++;
-	}
-	else {
-	  /* add index to array of extra atoms to be computed with CPU */
-	  if (extra_len >= nbins) {
-	    fprintf(stderr, "exceeded space for storing extra atoms\n");
-	    return -1;
-	  }
-	  extra_atoms[extra_len] = atom[n];
-	  extra_len++;
-	}
-      }
-      else {
-	/* excluded atoms are either outside bins or neutrally charged */
-	num_excluded++;
+      i = (int)floorf(p.x * BIN_INVLEN);
+      j = (int)floorf(p.y * BIN_INVLEN);
+      k = (int)floorf(p.z * BIN_INVLEN);
+      if (i >= -c && i < binDim.x - c && j >= -c && j < binDim.y - c &&
+          k >= -c && k < binDim.z - c && atom[n].q != 0) {
+        int index = (k * binDim.y + j) * binDim.x + i;
+        cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
+        int bindex = bincntZeroAddr[index];
+        if (bindex < BIN_DEPTH) {
+          /* copy atom into bin and increase counter for this bin */
+          bin[bindex] = p;
+          bincntZeroAddr[index]++;
+        } else {
+          /* add index to array of extra atoms to be computed with CPU */
+          if (extra_len >= nbins) {
+            fprintf(stderr, "exceeded space for storing extra atoms\n");
+            return -1;
+          }
+          extra_atoms[extra_len] = atom[n];
+          extra_len++;
+        }
+      } else {
+        /* excluded atoms are either outside bins or neutrally charged */
+        num_excluded++;
       }
     }
 
@@ -207,24 +201,24 @@ int gpu_compute_cutoff_potential_lattice(
 
   /* bin stats */
   sum = total = 0;
-  for (n = 0;  n < nbins;  n++) {
-    binHistoFull[ bincntBaseAddr[n] ]++;
+  for (n = 0; n < nbins; n++) {
+    binHistoFull[bincntBaseAddr[n]]++;
     sum += bincntBaseAddr[n];
     total += BIN_DEPTH;
   }
-  avgFillFull = sum / (float) total;
+  avgFillFull = sum / (float)total;
   sum = total = 0;
-  for (k = 0;  k < binDim.z - 2*c;  k++) {
-    for (j = 0;  j < binDim.y - 2*c;  j++) {
-      for (i = 0;  i < binDim.x - 2*c;  i++) {
+  for (k = 0; k < binDim.z - 2 * c; k++) {
+    for (j = 0; j < binDim.y - 2 * c; j++) {
+      for (i = 0; i < binDim.x - 2 * c; i++) {
         int index = (k * binDim.y + j) * binDim.x + i;
-        binHistoCover[ bincntZeroAddr[index] ]++;
+        binHistoCover[bincntZeroAddr[index]]++;
         sum += bincntZeroAddr[index];
         total += BIN_DEPTH;
       }
     }
   }
-  avgFillCover = sum / (float) total;
+  avgFillCover = sum / (float)total;
 
   if (verbose) {
     /* report */
@@ -233,25 +227,25 @@ int gpu_compute_cutoff_potential_lattice(
     printf("cutoff distance = %g\n", cutoff);
     printf("\n");
     printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
-    printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
+    printf("requested space dimensions = %g %g %g\n", nx * h, ny * h, nz * h);
     printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
-    printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
-    printf("number of bytes for lattice data = %u\n", lnall*sizeof(float));
+    printf("expanded space dimensions = %g %g %g\n", lnx * h, lny * h, lnz * h);
+    printf("number of bytes for lattice data = %u\n", lnall * sizeof(float));
     printf("\n");
     printf("bin padding thickness = %d\n", c);
-    printf("bin cover dimensions = %d %d %d\n",
-        binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
+    printf("bin cover dimensions = %d %d %d\n", binDim.x - 2 * c,
+           binDim.y - 2 * c, binDim.z - 2 * c);
     printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
     printf("number of bins = %d\n", nbins);
     printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
     printf("%% overhead space = %g\n",
-        (natoms / (double) (nbins * BIN_DEPTH)) * 100);
+           (natoms / (double)(nbins * BIN_DEPTH)) * 100);
     printf("number of bytes for bin data = %u\n",
-        nbins * BIN_DEPTH * sizeof(cl_float4));
+           nbins * BIN_DEPTH * sizeof(cl_float4));
     printf("\n");
     printf("bin histogram with padding:\n");
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       printf("     number of bins with %d atoms:  %d\n", n, binHistoFull[n]);
       sum += binHistoFull[n];
     }
@@ -260,7 +254,7 @@ int gpu_compute_cutoff_potential_lattice(
     printf("\n");
     printf("bin histogram excluding padding:\n");
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       printf("     number of bins with %d atoms:  %d\n", n, binHistoCover[n]);
       sum += binHistoCover[n];
     }
@@ -268,24 +262,27 @@ int gpu_compute_cutoff_potential_lattice(
     printf("     %% average fill:  %g\n", avgFillCover * 100);
     printf("\n");
     printf("number of extra atoms = %d\n", extra->size);
-    printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
+    printf("%% atoms that are extra = %g\n",
+           (extra->size / (double)natoms) * 100);
     printf("\n");
 
     /* sanity check on bins */
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       sum += n * binHistoFull[n];
     }
     sum += extra->size + num_excluded;
     printf("sanity check on bin histogram with edges:  "
-        "sum + others = %d\n", sum);
+           "sum + others = %d\n",
+           sum);
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       sum += n * binHistoCover[n];
     }
     sum += extra->size + num_excluded;
     printf("sanity check on bin histogram excluding edges:  "
-        "sum + others = %d\n", sum);
+           "sum + others = %d\n",
+           sum);
     printf("\n");
 
     /* neighbor list */
@@ -295,34 +292,39 @@ int gpu_compute_cutoff_potential_lattice(
 
   cl_int clStatus;
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
 
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs");
 
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
-  
-  const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+
+  const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
   CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"-I src/opencl_base");  //-cl-nv-verbose
+  sprintf(clOptions, "-I src/opencl_base"); //-cl-nv-verbose
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
 
-  cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice",&clStatus);
+  cl_kernel clKernel =
+      clCreateKernel(clProgram, "opencl_cutoff_potential_lattice", &clStatus);
   CHECK_ERROR("clCreateKernel")
 
   /* setup OpenCL kernel parameters */
@@ -337,66 +339,75 @@ int gpu_compute_cutoff_potential_lattice(
   pb_SwitchToTimer(timers, pb_TimerID_COPY);
   if (verbose) {
     printf("Allocating %.2fMB on OpenCL device for potentials\n",
-           lnall * sizeof(float) / (double) (1024*1024));
+           lnall * sizeof(float) / (double)(1024 * 1024));
   }
 
-  regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(float),NULL,&clStatus);
+  regionZeroCl = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                                lnall * sizeof(float), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
-  clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(float));
+  clMemSet(clCommandQueue, regionZeroCl, 0, lnall * sizeof(float));
 
   if (verbose) {
     printf("Allocating %.2fMB on OpenCL device for atom bins\n",
-           nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
+           nbins * BIN_DEPTH * sizeof(cl_float4) / (double)(1024 * 1024));
   }
 
-  binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
+  binBaseCl =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                     nbins * BIN_DEPTH * sizeof(cl_float4), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
- 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, binBaseCl, CL_TRUE, 0,
+                                  nbins * BIN_DEPTH * sizeof(cl_float4),
+                                  binBaseAddr, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  //Sub buffers are not supported in OpenCL v1.0
-  int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;  
+  // Sub buffers are not supported in OpenCL v1.0
+  int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
 
-  NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
+  NbrListLen =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, sizeof(int), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, NbrListLen, CL_TRUE, 0,
+                                  sizeof(int), &nbrlistlen, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
+  NbrList = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                           NBRLIST_MAXLEN * sizeof(xyz), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, NbrList, CL_TRUE, 0,
+                           nbrlistlen * sizeof(xyz), nbrlist, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  if (verbose) 
+  if (verbose)
     printf("\n");
 
-
-  clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&regionZeroCl);
-  clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
-  clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(int), &(binDim.x));
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), &(binDim.y));
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &binBaseCl);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &offset);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(float), &h);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(float), &cutoff2);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(float), &inv_cutoff2);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &regionZeroCl);
+  clStatus = clSetKernelArg(clKernel, 9, sizeof(cl_mem), &NbrListLen);
+  clStatus = clSetKernelArg(clKernel, 10, sizeof(cl_mem), &NbrList);
   CHECK_ERROR("clSetKernelArg")
 
-
   /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
   pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
   printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
-  for (zRegionIndex = 0;  zRegionIndex < zRegionDim;  zRegionIndex++) {
+  for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
     printf("  computing plane %d\r", zRegionIndex);
     fflush(stdout);
 
-    clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
+    clStatus = clSetKernelArg(clKernel, 8, sizeof(int), &zRegionIndex);
     CHECK_ERROR("clSetKernelArg")
 
-    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL,
+                                      gridDim, blockDim, 0, NULL, NULL);
     CHECK_ERROR("clEnqueueNDRangeKernel")
     clStatus = clFinish(clCommandQueue);
     CHECK_ERROR("clFinish")
@@ -406,7 +417,9 @@ int gpu_compute_cutoff_potential_lattice(
 
   /* copy result regions from OpenCL device */
   pb_SwitchToTimer(timers, pb_TimerID_COPY);
-  clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(float),regionZeroAddr,0,NULL,NULL);
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, regionZeroCl, CL_TRUE, 0,
+                          lnall * sizeof(float), regionZeroAddr, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
   /* free OpenCL memory allocations */
@@ -421,25 +434,26 @@ int gpu_compute_cutoff_potential_lattice(
   clStatus = clReleaseCommandQueue(clCommandQueue);
   clStatus = clReleaseContext(clContext);
 
-  free((void*)clSource[0]);
+  free((void *)clSource[0]);
 
   /* transpose regions back into lattice */
   pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-  for (k = 0;  k < nz;  k++) {
+  for (k = 0; k < nz; k++) {
     zRegionIndex = (k >> 3);
     zOffset = (k & 7);
 
-    for (j = 0;  j < ny;  j++) {
+    for (j = 0; j < ny; j++) {
       yRegionIndex = (j >> 3);
       yOffset = (j & 7);
 
-      for (i = 0;  i < nx;  i++) {
+      for (i = 0; i < nx; i++) {
         xRegionIndex = (i >> 3);
         xOffset = (i & 7);
 
-        thisRegion = regionZeroAddr
-          + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
-              + xRegionIndex) * REGION_SIZE;
+        thisRegion = regionZeroAddr +
+                     ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim +
+                      xRegionIndex) *
+                         REGION_SIZE;
 
         indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
         index = (k * ny + j) * nx + i;
@@ -454,7 +468,7 @@ int gpu_compute_cutoff_potential_lattice(
     printf("computing extra atoms on CPU\n");
     if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
       fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
-          "for extra atoms\n");
+                      "for extra atoms\n");
       return -1;
     }
     printf("\n");
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.h
index 883738c120465a53fd8af91a1d3845994d5144d3..c3e011bd14bda43b07a1eb82b0c436d18d1c8356 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/cutoff.h
@@ -15,54 +15,51 @@ extern "C" {
 
 #define SHIFTED
 
-  /* A structure to record how points in 3D space map to array
-     elements.  Array element (z, y, x)
-     where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
-     maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
-  */
-  typedef struct LatticeDim_t {
-    /* Number of lattice points in x, y, z dimensions */
-    int nx, ny, nz;
+/* A structure to record how points in 3D space map to array
+   elements.  Array element (z, y, x)
+   where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
+   maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
+*/
+typedef struct LatticeDim_t {
+  /* Number of lattice points in x, y, z dimensions */
+  int nx, ny, nz;
 
-    /* Lowest corner of lattice */
-    Vec3 lo;
+  /* Lowest corner of lattice */
+  Vec3 lo;
 
-    /* Lattice spacing */
-    float h;
-  } LatticeDim;
+  /* Lattice spacing */
+  float h;
+} LatticeDim;
 
-  /* An electric potential field sampled on a regular grid.  The
-     lattice size and grid point positions are specified by 'dim'.
-  */
-  typedef struct Lattice_t {
-    LatticeDim dim;
-    float *lattice;
-  } Lattice;
+/* An electric potential field sampled on a regular grid.  The
+   lattice size and grid point positions are specified by 'dim'.
+*/
+typedef struct Lattice_t {
+  LatticeDim dim;
+  float *lattice;
+} Lattice;
 
-  LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
 
-  Lattice *create_lattice(LatticeDim dim);
-  void destroy_lattice(Lattice *);
+Lattice *create_lattice(LatticeDim dim);
+void destroy_lattice(Lattice *);
 
-  int gpu_compute_cutoff_potential_lattice(
-      struct pb_TimerSet *timers,
-      Lattice *lattice,
-      float cutoff,                      /* cutoff distance */
-      Atoms *atom,                       /* array of atoms */
-      int verbose                        /* print info/debug messages */
-    );
+int gpu_compute_cutoff_potential_lattice(
+    struct pb_TimerSet *timers, Lattice *lattice,
+    float cutoff, /* cutoff distance */
+    Atoms *atom,  /* array of atoms */
+    int verbose   /* print info/debug messages */
+);
 
-  int cpu_compute_cutoff_potential_lattice(
-      Lattice *lattice,                  /* the lattice */
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms                       /* array of atoms */
-    );
+int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                         float cutoff,     /* cutoff distance */
+                                         Atoms *atoms      /* array of atoms */
+);
 
-  int remove_exclusions(
-      Lattice *lattice,                  /* the lattice */
-      float exclcutoff,                  /* exclusion cutoff distance */
-      Atoms *atom                        /* array of atoms */
-    );
+int remove_exclusions(Lattice *lattice, /* the lattice */
+                      float exclcutoff, /* exclusion cutoff distance */
+                      Atoms *atom       /* array of atoms */
+);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/excl.c
index 1216854a9b1f76489015ca6cc9a43a8ca5c959df..10d9e5468be82086609ecbae0e557c30fc0633c9 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/excl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/excl.c
@@ -6,24 +6,22 @@
  *cr
  ***************************************************************************/
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int remove_exclusions(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* exclusion cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int remove_exclusions(Lattice *lattice, /* the lattice */
+                             float cutoff,     /* exclusion cutoff distance */
+                             Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -35,8 +33,8 @@ extern int remove_exclusions(
 
   const float a2 = cutoff * cutoff;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -64,44 +62,45 @@ extern int remove_exclusions(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(atoms->size * sizeof(int));
-  for (n = 0;  n < atoms->size;  n++) {
+  next = (int *)malloc(atoms->size * sizeof(int));
+  for (n = 0; n < atoms->size; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < atoms->size;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < atoms->size; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -112,42 +111,49 @@ extern int remove_exclusions(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
 
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 
           dx = xstart;
           index = jkoff + ia;
           pg = lattice->lattice + index;
 
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
 
-	    /* If atom and lattice point are too close, set the lattice value
-	     * to zero */
-            if (r2 < a2) *pg = 0;
+            /* If atom and lattice point are too close, set the lattice value
+             * to zero */
+            if (r2 < a2)
+              *pg = 0;
           }
         }
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/macros.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/macros.h
index fed9553c9629207ff7592e3a1ff320eed027c1fb..adb557123d07c16baabba79d727ff8cfd2c3ad83 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/macros.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/macros.h
@@ -4,22 +4,24 @@
 #ifdef __DEVICE_EMULATION__
 #define DEBUG
 /* define which grid block and which thread to examine */
-#define BX  0
-#define BY  0
-#define TX  0
-#define TY  0
-#define TZ  0
-#define EMU(code) do { \
-  if (blockIdx.x==BX && blockIdx.y==BY && \
-      threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
-    code; \
-  } \
-} while (0)
-#define INT(n)    printf("%s = %d\n", #n, n)
-#define FLOAT(f)  printf("%s = %g\n", #f, (double)(f))
-#define INT3(n)   printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
-#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
-    (double)(f).y, (double)(f).z, (double)(f).w)
+#define BX 0
+#define BY 0
+#define TX 0
+#define TY 0
+#define TZ 0
+#define EMU(code)                                                              \
+  do {                                                                         \
+    if (blockIdx.x == BX && blockIdx.y == BY && threadIdx.x == TX &&           \
+        threadIdx.y == TY && threadIdx.z == TZ) {                              \
+      code;                                                                    \
+    }                                                                          \
+  } while (0)
+#define INT(n) printf("%s = %d\n", #n, n)
+#define FLOAT(f) printf("%s = %g\n", #f, (double)(f))
+#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
+#define FLOAT4(f)                                                              \
+  printf("%s = %g %g %g %g\n", #f, (double)(f).x, (double)(f).y,               \
+         (double)(f).z, (double)(f).w)
 #else
 #define EMU(code)
 #define INT(n)
@@ -29,12 +31,11 @@
 #endif
 
 /* report error from OpenCL */
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 /*
@@ -45,7 +46,7 @@
  * reserve enough memory for 11^3 stencil of grid cells
  * this fits within 16K of memory
  */
-#define NBRLIST_DIM  11
+#define NBRLIST_DIM 11
 #define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
 
 /*
@@ -54,16 +55,16 @@
  * this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
  * should permit scheduling of up to 3 thread blocks per SM
  */
-#define BIN_DEPTH         8  /* max number of atoms per bin */
-#define BIN_SIZE         32  /* size of bin in floats */
-#define BIN_CACHE_MAXLEN 32  /* max number of atom bins to cache */
+#define BIN_DEPTH 8         /* max number of atoms per bin */
+#define BIN_SIZE 32         /* size of bin in floats */
+#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */
 
-#define BIN_LENGTH      4.f  /* spatial length in Angstroms */
-#define BIN_INVLEN  (1.f / BIN_LENGTH)
+#define BIN_LENGTH 4.f /* spatial length in Angstroms */
+#define BIN_INVLEN (1.f / BIN_LENGTH)
 /* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
  * so that bin fill should be 80% (for non-empty regions of space) */
 
-#define REGION_SIZE     512  /* number of floats in lattice region */
-#define SUB_REGION_SIZE 128  /* number of floats in lattice sub-region */
+#define REGION_SIZE 512     /* number of floats in lattice region */
+#define SUB_REGION_SIZE 128 /* number of floats in lattice sub-region */
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/main.c
index 9bc6837371b847b8e7a2dd99945ce635d48d1f66..4bc31d3e9b78ddf32d4a3617e69c7bc3e4bc62c6 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/main.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/main.c
@@ -6,11 +6,11 @@
  *cr
  ***************************************************************************/
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
@@ -18,16 +18,15 @@
 
 #define ERRTOL 1e-4f
 
-#define NOKERNELS             0
-#define CUTOFF1               1
-#define CUTOFF6              32
-#define CUTOFF6OVERLAP       64
-#define CUTOFFCPU         16384
-
+#define NOKERNELS 0
+#define CUTOFF1 1
+#define CUTOFF6 32
+#define CUTOFF6OVERLAP 64
+#define CUTOFFCPU 16384
 
 int appenddata(const char *filename, int size, double time) {
   FILE *fp;
-  fp=fopen(filename, "a");
+  fp = fopen(filename, "a");
   if (fp == NULL) {
     printf("error appending to file %s..\n", filename);
     return -1;
@@ -37,23 +36,19 @@ int appenddata(const char *filename, int size, double time) {
   return 0;
 }
 
-LatticeDim
-lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
-{
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) {
   LatticeDim ret;
 
-  ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
-  ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
-  ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
+  ret.nx = (int)floorf((hi.x - lo.x) / h) + 1;
+  ret.ny = (int)floorf((hi.y - lo.y) / h) + 1;
+  ret.nz = (int)floorf((hi.z - lo.z) / h) + 1;
   ret.lo = lo;
   ret.h = h;
 
   return ret;
 }
 
-Lattice *
-create_lattice(LatticeDim dim)
-{
+Lattice *create_lattice(LatticeDim dim) {
   int size;
   Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
 
@@ -76,10 +71,7 @@ create_lattice(LatticeDim dim)
   return lat;
 }
 
-
-void
-destroy_lattice(Lattice *lat)
-{
+void destroy_lattice(Lattice *lat) {
   if (lat) {
     free(lat->lattice);
     free(lat);
@@ -91,13 +83,13 @@ int main(int argc, char *argv[]) {
 
   LatticeDim lattice_dim;
   Lattice *gpu_lattice;
-  Vec3 min_ext, max_ext;	/* Bounding box of atoms */
-  Vec3 lo, hi;			/* Bounding box with padding  */
+  Vec3 min_ext, max_ext; /* Bounding box of atoms */
+  Vec3 lo, hi;           /* Bounding box with padding  */
 
-  float h = 0.5f;		/* Lattice spacing */
-  float cutoff = 12.f;		/* Cutoff radius */
-  float exclcutoff = 1.f;	/* Radius for exclusion */
-  float padding = 0.5f;		/* Bounding box padding distance */
+  float h = 0.5f;         /* Lattice spacing */
+  float cutoff = 12.f;    /* Cutoff radius */
+  float exclcutoff = 1.f; /* Radius for exclusion */
+  float padding = 0.5f;   /* Bounding box padding distance */
 
   int n;
 
@@ -137,9 +129,10 @@ int main(int argc, char *argv[]) {
   printf("  maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
 
   printf("padding domain by %g Angstroms\n", padding);
-  lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
-  hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
-  printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
+  lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
+  hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
+  printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y,
+         hi.z - lo.z);
 
   lattice_dim = lattice_from_bounding_box(lo, hi, h);
   gpu_lattice = create_lattice(lattice_dim);
@@ -149,7 +142,8 @@ int main(int argc, char *argv[]) {
    *  Run OpenCL kernel
    *  (Begin and end with COMPUTE timer active)
    */
-  if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0)) {
+  if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom,
+                                           0)) {
     fprintf(stderr, "Computation failed\n");
     exit(1);
   }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.h
index b88103818f6499a3cdddd40ff3d5ac345d2762f1..a88ee486f16f0452ec9894a3b2b28d9e961d417e 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/ocl.h
@@ -2,14 +2,13 @@
 #define __OCLH__
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.c
index 36ee7e2b06e7650a1d096f2f3f80f8894f24cdf8..73fa63903a84d3cc741917d020f198133d898062 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.c
@@ -6,18 +6,16 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
 #include <inttypes.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 #include "atom.h"
 #include "cutoff.h"
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice)
-{
+void write_lattice_summary(const char *filename, Lattice *lattice) {
   float *lattice_data = lattice->lattice;
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
@@ -38,9 +36,9 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int i;
 
     for (i = 0; i < nx * ny * nz; i++)
-      abspotential += fabs((double) lattice_data[i]);
+      abspotential += fabs((double)lattice_data[i]);
 
-    tmp = (float) abspotential;
+    tmp = (float)abspotential;
 
     fwrite(&tmp, 1, sizeof(float), outfile);
   }
@@ -49,7 +47,7 @@ write_lattice_summary(const char *filename, Lattice *lattice)
   {
     uint32_t tmp;
 
-    tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
+    tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny);
     fwrite(&tmp, 1, sizeof(uint32_t), outfile);
   }
 
@@ -58,8 +56,8 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int plane_size = nx * ny;
 
     fwrite(lattice_data, plane_size, sizeof(float), outfile);
-    fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
-	   outfile);
+    fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float),
+           outfile);
   }
 
   /* Cleanup */
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.h
index 13022cd9e80843157cc78d7d2ff12afa85a0f826..f6c24bfc80bc63d0236d69577f832984c74a9eac 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/output.h
@@ -15,8 +15,7 @@
 extern "C" {
 #endif
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice);
+void write_lattice_summary(const char *filename, Lattice *lattice);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/readatom.c
index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/readatom.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_base/readatom.c
@@ -6,36 +6,33 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-
 
 #define LINELEN 96
 #define INITLEN 20
 
-
-Atoms *read_atom_file(const char *fname)
-{
+Atoms *read_atom_file(const char *fname) {
   FILE *file;
   char line[LINELEN];
 
-  Atom *atom;			/* Atom array */
-  int len = INITLEN;		/* Size of atom array */
-  int cnt = 0;			/* Number of atoms read */
+  Atom *atom;        /* Atom array */
+  int len = INITLEN; /* Size of atom array */
+  int cnt = 0;       /* Number of atoms read */
 
   /* open atom "pqr" file */
   file = fopen(fname, "r");
-  if (NULL==file) {
+  if (NULL == file) {
     fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
     return NULL;
   }
 
   /* allocate initial atom array */
-  atom = (Atom *) malloc(len * sizeof(Atom));
-  if (NULL==atom) {
+  atom = (Atom *)malloc(len * sizeof(Atom));
+  if (NULL == atom) {
     fprintf(stderr, "can't allocate memory\n");
     return NULL;
   }
@@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname)
   while (fgets(line, LINELEN, file) != NULL) {
 
     if (strncmp(line, "ATOM  ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
-      continue;  /* skip anything that isn't an atom record */
+      continue; /* skip anything that isn't an atom record */
     }
 
-    if (cnt==len) {  /* extend atom array */
-      void *tmp = realloc(atom, 2*len*sizeof(Atom));
-      if (NULL==tmp) {
+    if (cnt == len) { /* extend atom array */
+      void *tmp = realloc(atom, 2 * len * sizeof(Atom));
+      if (NULL == tmp) {
         fprintf(stderr, "can't allocate more memory\n");
         return NULL;
       }
-      atom = (Atom *) tmp;
+      atom = (Atom *)tmp;
       len *= 2;
     }
 
     /* read position coordinates and charge from atom record */
     if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
-          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
-      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
+               &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
+      fprintf(stderr, "atom record %d does not have expected format\n",
+              cnt + 1);
       return NULL;
     }
 
-    cnt++;  /* count atoms as we store them */
+    cnt++; /* count atoms as we store them */
   }
 
   /* verify EOF and close file */
-  if ( !feof(file) ) {
+  if (!feof(file)) {
     fprintf(stderr, "did not find EOF\n");
     return NULL;
   }
@@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname)
   }
 }
 
-
-void free_atom(Atoms *atom)
-{
+void free_atom(Atoms *atom) {
   if (atom) {
     free(atom->atoms);
     free(atom);
   }
 }
 
-void
-get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
-{
+void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) {
   Atom *atoms = atom->atoms;
   int natoms = atom->size;
   Vec3 lo;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/atom.h
index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/atom.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/atom.h
@@ -13,22 +13,22 @@
 extern "C" {
 #endif
 
-  typedef struct Atom_t {
-    float x, y, z, q;
-  } Atom;
-
-  typedef struct Atoms_t {
-    Atom *atoms;
-    int size;
-  } Atoms;
-
-  typedef struct Vec3_t {
-    float x, y, z;
-  } Vec3;
-
-  Atoms *read_atom_file(const char *fname);
-  void free_atom(Atoms *atom);
-  void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
+typedef struct Atom_t {
+  float x, y, z, q;
+} Atom;
+
+typedef struct Atoms_t {
+  Atom *atoms;
+  int size;
+} Atoms;
+
+typedef struct Vec3_t {
+  float x, y, z;
+} Vec3;
+
+Atoms *read_atom_file(const char *fname);
+void free_atom(Atoms *atom);
+void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutcpu.c
index f0fbdc79f25679053ae2b8fbcd997db178b5a4d4..475a4666e1a6366873dc49d18d311b76ef6cde38 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutcpu.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutcpu.c
@@ -6,11 +6,11 @@
  *cr
  ***************************************************************************/
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
@@ -18,15 +18,14 @@
 #undef DEBUG_PASS_RATE
 #define CHECK_CYLINDER_CPU
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int cpu_compute_cutoff_potential_lattice(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int
+cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                     float cutoff,     /* cutoff distance */
+                                     Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -41,8 +40,8 @@ extern int cpu_compute_cutoff_potential_lattice(
   const float inv_a2 = 1.f / a2;
   float s;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -64,7 +63,7 @@ extern int cpu_compute_cutoff_potential_lattice(
   int ncell, nxcell, nycell, nzcell;
   int *first, *next;
   float inv_cellen = INV_CELLEN;
-  Vec3 minext, maxext;		/* Extent of atom bounding box */
+  Vec3 minext, maxext; /* Extent of atom bounding box */
   float xmin, ymin, zmin;
   float xmax, ymax, zmax;
 
@@ -77,44 +76,45 @@ extern int cpu_compute_cutoff_potential_lattice(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(natoms * sizeof(int));
-  for (n = 0;  n < natoms;  n++) {
+  next = (int *)malloc(natoms * sizeof(int));
+  for (n = 0; n < natoms; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < natoms;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < natoms; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -125,26 +125,33 @@ extern int cpu_compute_cutoff_potential_lattice(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 #ifdef CHECK_CYLINDER_CPU
-          if (dydz2 >= a2) continue;
+          if (dydz2 >= a2)
+            continue;
 #endif
 
           dx = xstart;
@@ -152,27 +159,26 @@ extern int cpu_compute_cutoff_potential_lattice(
           pg = lattice->lattice + index;
 
 #if defined(__INTEL_COMPILER)
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
             s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s;
-            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
+            e = q * (1 / sqrtf(r2)) * s;
+            *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
           }
 #else
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
-            if (r2 >= a2)
-		{
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
+            if (r2 >= a2) {
 #ifdef DEBUG_PASS_RATE
-		  fail_count++;
+              fail_count++;
 #endif
-		  continue;
-		}
+              continue;
+            }
 #ifdef DEBUG_PASS_RATE
-	    pass_count++;
+            pass_count++;
 #endif
             s = (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s * s;
+            e = q * (1 / sqrtf(r2)) * s * s;
             *pg += e;
           }
 #endif
@@ -180,7 +186,7 @@ extern int cpu_compute_cutoff_potential_lattice(
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
@@ -188,8 +194,8 @@ extern int cpu_compute_cutoff_potential_lattice(
 
   /* For debugging: print the number of times that the test passed/failed */
 #ifdef DEBUG_PASS_RATE
-  printf ("Pass :%lld\n", pass_count);
-  printf ("Fail :%lld\n", fail_count);
+  printf("Pass :%lld\n", pass_count);
+  printf("Fail :%lld\n", fail_count);
 #endif
 
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff.h
index 955c788f658ae823e103ea4d040ba4f8c6179fef..13378e5e9be17209476e71e749b44be6733bb8d9 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff.h
@@ -15,54 +15,51 @@ extern "C" {
 
 #define SHIFTED
 
-  /* A structure to record how points in 3D space map to array
-     elements.  Array element (z, y, x)
-     where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
-     maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
-  */
-  typedef struct LatticeDim_t {
-    /* Number of lattice points in x, y, z dimensions */
-    int nx, ny, nz;
+/* A structure to record how points in 3D space map to array
+   elements.  Array element (z, y, x)
+   where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
+   maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
+*/
+typedef struct LatticeDim_t {
+  /* Number of lattice points in x, y, z dimensions */
+  int nx, ny, nz;
 
-    /* Lowest corner of lattice */
-    Vec3 lo;
+  /* Lowest corner of lattice */
+  Vec3 lo;
 
-    /* Lattice spacing */
-    float h;
-  } LatticeDim;
+  /* Lattice spacing */
+  float h;
+} LatticeDim;
 
-  /* An electric potential field sampled on a regular grid.  The
-     lattice size and grid point positions are specified by 'dim'.
-  */
-  typedef struct Lattice_t {
-    LatticeDim dim;
-    float *lattice;
-  } Lattice;
+/* An electric potential field sampled on a regular grid.  The
+   lattice size and grid point positions are specified by 'dim'.
+*/
+typedef struct Lattice_t {
+  LatticeDim dim;
+  float *lattice;
+} Lattice;
 
-  LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
 
-  Lattice *create_lattice(LatticeDim dim);
-  void destroy_lattice(Lattice *);
+Lattice *create_lattice(LatticeDim dim);
+void destroy_lattice(Lattice *);
 
-  int gpu_compute_cutoff_potential_lattice6overlap(
-      struct pb_TimerSet *timers,        /* for measuring execution time */
-      Lattice *lattice,
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms,                      /* array of atoms */
-      int verbose                        /* print info/debug messages */
-    );
+int gpu_compute_cutoff_potential_lattice6overlap(
+    struct pb_TimerSet *timers,     /* for measuring execution time */
+    Lattice *lattice, float cutoff, /* cutoff distance */
+    Atoms *atoms,                   /* array of atoms */
+    int verbose                     /* print info/debug messages */
+);
 
-  int cpu_compute_cutoff_potential_lattice(
-      Lattice *lattice,                  /* the lattice */
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms                       /* array of atoms */
-    );
+int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                         float cutoff,     /* cutoff distance */
+                                         Atoms *atoms      /* array of atoms */
+);
 
-  int remove_exclusions(
-      Lattice *lattice,                  /* the lattice */
-      float exclcutoff,                  /* exclusion cutoff distance */
-      Atoms *atom                        /* array of atoms */
-    );
+int remove_exclusions(Lattice *lattice, /* the lattice */
+                      float exclcutoff, /* exclusion cutoff distance */
+                      Atoms *atom       /* array of atoms */
+);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c
index 15e7aae1e160eb242d10c31911fee3fefdb50889..06f856c1a0fa43dc95cb896450baa42f74c047fd 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/cutoff6overlap.c
@@ -7,19 +7,19 @@
  ***************************************************************************/
 #include <CL/cl.h>
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
-#include "ocl.h"
 #include "macros.h"
+#include "ocl.h"
 
-//OpenCL v1.0
-//cl_int3 not defined
+// OpenCL v1.0
+// cl_int3 not defined
 #ifdef CL_VERSION_1_1
 #if CL_VERSION_1_1 != 1
 typedef cl_int4 cl_int3;
@@ -37,15 +37,13 @@ const cl_version_check = 0;
 // we use int4 instead.  Only the 'x', 'y', and 'z' fields of xyz are used.
 typedef cl_int4 xyz;
 
-//extern "C" int gpu_compute_cutoff_potential_lattice6overlap(
+// extern "C" int gpu_compute_cutoff_potential_lattice6overlap(
 int gpu_compute_cutoff_potential_lattice6overlap(
-    struct pb_TimerSet *timers,        /* for measuring execution time */
-    Lattice *lattice,
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms,                      /* array of atoms */
-    int verbose                        /* print info/debug messages */
-    )
-{
+    struct pb_TimerSet *timers,     /* for measuring execution time */
+    Lattice *lattice, float cutoff, /* cutoff distance */
+    Atoms *atoms,                   /* array of atoms */
+    int verbose                     /* print info/debug messages */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -59,8 +57,8 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   xyz nbrlist[NBRLIST_MAXLEN];
   int nbrlistlen = 0;
 
-  int binHistoFull[BIN_DEPTH+1] = { 0 };   /* clear every array element */
-  int binHistoCover[BIN_DEPTH+1] = { 0 };  /* clear every array element */
+  int binHistoFull[BIN_DEPTH + 1] = {0};  /* clear every array element */
+  int binHistoCover[BIN_DEPTH + 1] = {0}; /* clear every array element */
   int num_excluded = 0;
 
   int xRegionDim, yRegionDim, zRegionDim;
@@ -92,16 +90,16 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   size_t gridDim[3], blockDim[3];
 
 #ifdef NEIGHBOR_COUNT
-  double neighbor_count = 0;	/* used to profile the number of atoms near a
-				 * lattice point */
+  double neighbor_count = 0; /* used to profile the number of atoms near a
+                              * lattice point */
 #endif
 
   // Caller has made the "compute" timer active
 
   /* pad lattice to be factor of 8 in each dimension */
-  xRegionDim = (int) ceilf(nx/8.f);
-  yRegionDim = (int) ceilf(ny/8.f);
-  zRegionDim = (int) ceilf(nz/8.f);
+  xRegionDim = (int)ceilf(nx / 8.f);
+  yRegionDim = (int)ceilf(ny / 8.f);
+  zRegionDim = (int)ceilf(nz / 8.f);
 
   lnx = 8 * xRegionDim;
   lny = 8 * yRegionDim;
@@ -109,35 +107,36 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   lnall = lnx * lny * lnz;
 
   /* will receive energies from OpenCL */
-  regionZeroAddr = (ener_t *) malloc(lnall * sizeof(float));
+  regionZeroAddr = (ener_t *)malloc(lnall * sizeof(float));
 
   /* create bins */
-  c = (int) ceil(cutoff * BIN_INVLEN);  /* count extra bins around lattice */
-  binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
-  binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
-  binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
+  c = (int)ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */
+  binDim.x = (int)ceil(lnx * h * BIN_INVLEN) + 2 * c;
+  binDim.y = (int)ceil(lny * h * BIN_INVLEN) + 2 * c;
+  binDim.z = (int)ceil(lnz * h * BIN_INVLEN) + 2 * c;
   nbins = binDim.x * binDim.y * binDim.z;
-  binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
+  binBaseAddr = (cl_float4 *)calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
   binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
 
-  bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
+  bincntBaseAddr = (int *)calloc(nbins, sizeof(int));
   bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
 
   /* create neighbor list */
-  if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
+  if (ceilf(BIN_LENGTH / (8 * h)) == floorf(BIN_LENGTH / (8 * h))) {
     float s = sqrtf(3);
-    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
+    float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH);
     int cnt = 0;
     /* develop neighbor list around 1 cell */
-    if (2*c + 1 > NBRLIST_DIM) {
+    if (2 * c + 1 > NBRLIST_DIM) {
       fprintf(stderr, "must have cutoff <= %f\n",
-          (NBRLIST_DIM-1)/2 * BIN_LENGTH);
+              (NBRLIST_DIM - 1) / 2 * BIN_LENGTH);
       return -1;
     }
-    for (k = -c;  k <= c;  k++) {
-      for (j = -c;  j <= c;  j++) {
-        for (i = -c;  i <= c;  i++) {
-          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
+    for (k = -c; k <= c; k++) {
+      for (j = -c; j <= c; j++) {
+        for (i = -c; i <= c; i++) {
+          if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2)
+            continue;
           nbrlist[cnt].x = i;
           nbrlist[cnt].y = j;
           nbrlist[cnt].z = k;
@@ -146,21 +145,21 @@ int gpu_compute_cutoff_potential_lattice6overlap(
       }
     }
     nbrlistlen = cnt;
-  }
-  else if (8*h <= 2*BIN_LENGTH) {
-    float s = 2.f*sqrtf(3);
-    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
+  } else if (8 * h <= 2 * BIN_LENGTH) {
+    float s = 2.f * sqrtf(3);
+    float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH);
     int cnt = 0;
     /* develop neighbor list around 3-cube of cells */
-    if (2*c + 3 > NBRLIST_DIM) {
+    if (2 * c + 3 > NBRLIST_DIM) {
       fprintf(stderr, "must have cutoff <= %f\n",
-          (NBRLIST_DIM-3)/2 * BIN_LENGTH);
+              (NBRLIST_DIM - 3) / 2 * BIN_LENGTH);
       return -1;
     }
-    for (k = -c;  k <= c;  k++) {
-      for (j = -c;  j <= c;  j++) {
-        for (i = -c;  i <= c;  i++) {
-          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
+    for (k = -c; k <= c; k++) {
+      for (j = -c; j <= c; j++) {
+        for (i = -c; i <= c; i++) {
+          if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2)
+            continue;
           nbrlist[cnt].x = i;
           nbrlist[cnt].y = j;
           nbrlist[cnt].z = k;
@@ -169,8 +168,7 @@ int gpu_compute_cutoff_potential_lattice6overlap(
       }
     }
     nbrlistlen = cnt;
-  }
-  else {
+  } else {
     fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
     return -1;
   }
@@ -178,43 +176,39 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   /* perform geometric hashing of atoms into bins */
   {
     /* array of extra atoms, permit average of one extra per bin */
-    Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
+    Atom *extra_atoms = (Atom *)calloc(nbins, sizeof(Atom));
     int extra_len = 0;
-    
-    for (n = 0;  n < natoms;  n++) {
+
+    for (n = 0; n < natoms; n++) {
       cl_float4 p;
       p.x = atom[n].x - xlo;
       p.y = atom[n].y - ylo;
       p.z = atom[n].z - zlo;
       p.w = atom[n].q;
-      i = (int) floorf(p.x * BIN_INVLEN);
-      j = (int) floorf(p.y * BIN_INVLEN);
-      k = (int) floorf(p.z * BIN_INVLEN);
-      if (i >= -c && i < binDim.x - c &&
-	  j >= -c && j < binDim.y - c &&
-	  k >= -c && k < binDim.z - c &&
-	  atom[n].q != 0) {
-	int index = (k * binDim.y + j) * binDim.x + i;
-	cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
-	int bindex = bincntZeroAddr[index];
-	if (bindex < BIN_DEPTH) {
-	  /* copy atom into bin and increase counter for this bin */
-	  bin[bindex] = p;
-	  bincntZeroAddr[index]++;
-	}
-	else {
-	  /* add index to array of extra atoms to be computed with CPU */
-	  if (extra_len >= nbins) {
-	    fprintf(stderr, "exceeded space for storing extra atoms\n");
-	    return -1;
-	  }
-	  extra_atoms[extra_len] = atom[n];
-	  extra_len++;
-	}
-      }
-      else {
-	/* excluded atoms are either outside bins or neutrally charged */
-	num_excluded++;
+      i = (int)floorf(p.x * BIN_INVLEN);
+      j = (int)floorf(p.y * BIN_INVLEN);
+      k = (int)floorf(p.z * BIN_INVLEN);
+      if (i >= -c && i < binDim.x - c && j >= -c && j < binDim.y - c &&
+          k >= -c && k < binDim.z - c && atom[n].q != 0) {
+        int index = (k * binDim.y + j) * binDim.x + i;
+        cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
+        int bindex = bincntZeroAddr[index];
+        if (bindex < BIN_DEPTH) {
+          /* copy atom into bin and increase counter for this bin */
+          bin[bindex] = p;
+          bincntZeroAddr[index]++;
+        } else {
+          /* add index to array of extra atoms to be computed with CPU */
+          if (extra_len >= nbins) {
+            fprintf(stderr, "exceeded space for storing extra atoms\n");
+            return -1;
+          }
+          extra_atoms[extra_len] = atom[n];
+          extra_len++;
+        }
+      } else {
+        /* excluded atoms are either outside bins or neutrally charged */
+        num_excluded++;
       }
     }
 
@@ -226,24 +220,24 @@ int gpu_compute_cutoff_potential_lattice6overlap(
 
   /* bin stats */
   sum = total = 0;
-  for (n = 0;  n < nbins;  n++) {
-    binHistoFull[ bincntBaseAddr[n] ]++;
+  for (n = 0; n < nbins; n++) {
+    binHistoFull[bincntBaseAddr[n]]++;
     sum += bincntBaseAddr[n];
     total += BIN_DEPTH;
   }
-  avgFillFull = sum / (float) total;
+  avgFillFull = sum / (float)total;
   sum = total = 0;
-  for (k = 0;  k < binDim.z - 2*c;  k++) {
-    for (j = 0;  j < binDim.y - 2*c;  j++) {
-      for (i = 0;  i < binDim.x - 2*c;  i++) {
+  for (k = 0; k < binDim.z - 2 * c; k++) {
+    for (j = 0; j < binDim.y - 2 * c; j++) {
+      for (i = 0; i < binDim.x - 2 * c; i++) {
         int index = (k * binDim.y + j) * binDim.x + i;
-        binHistoCover[ bincntZeroAddr[index] ]++;
+        binHistoCover[bincntZeroAddr[index]]++;
         sum += bincntZeroAddr[index];
         total += BIN_DEPTH;
       }
     }
   }
-  avgFillCover = sum / (float) total;
+  avgFillCover = sum / (float)total;
 
   if (verbose) {
     /* report */
@@ -252,25 +246,25 @@ int gpu_compute_cutoff_potential_lattice6overlap(
     printf("cutoff distance = %g\n", cutoff);
     printf("\n");
     printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
-    printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
+    printf("requested space dimensions = %g %g %g\n", nx * h, ny * h, nz * h);
     printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
-    printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
-    printf("number of bytes for lattice data = %u\n", lnall*sizeof(float));
+    printf("expanded space dimensions = %g %g %g\n", lnx * h, lny * h, lnz * h);
+    printf("number of bytes for lattice data = %u\n", lnall * sizeof(float));
     printf("\n");
     printf("bin padding thickness = %d\n", c);
-    printf("bin cover dimensions = %d %d %d\n",
-        binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
+    printf("bin cover dimensions = %d %d %d\n", binDim.x - 2 * c,
+           binDim.y - 2 * c, binDim.z - 2 * c);
     printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
     printf("number of bins = %d\n", nbins);
     printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
     printf("%% overhead space = %g\n",
-        (natoms / (double) (nbins * BIN_DEPTH)) * 100);
+           (natoms / (double)(nbins * BIN_DEPTH)) * 100);
     printf("number of bytes for bin data = %u\n",
-        nbins * BIN_DEPTH * sizeof(cl_float4));
+           nbins * BIN_DEPTH * sizeof(cl_float4));
     printf("\n");
     printf("bin histogram with padding:\n");
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       printf("     number of bins with %d atoms:  %d\n", n, binHistoFull[n]);
       sum += binHistoFull[n];
     }
@@ -279,7 +273,7 @@ int gpu_compute_cutoff_potential_lattice6overlap(
     printf("\n");
     printf("bin histogram excluding padding:\n");
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       printf("     number of bins with %d atoms:  %d\n", n, binHistoCover[n]);
       sum += binHistoCover[n];
     }
@@ -287,125 +281,145 @@ int gpu_compute_cutoff_potential_lattice6overlap(
     printf("     %% average fill:  %g\n", avgFillCover * 100);
     printf("\n");
     printf("number of extra atoms = %d\n", extra->size);
-    printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
+    printf("%% atoms that are extra = %g\n",
+           (extra->size / (double)natoms) * 100);
     printf("\n");
 
     /* sanity check on bins */
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       sum += n * binHistoFull[n];
     }
     sum += extra->size + num_excluded;
     printf("sanity check on bin histogram with edges:  "
-        "sum + others = %d\n", sum);
+           "sum + others = %d\n",
+           sum);
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       sum += n * binHistoCover[n];
     }
     sum += extra->size + num_excluded;
     printf("sanity check on bin histogram excluding edges:  "
-        "sum + others = %d\n", sum);
+           "sum + others = %d\n",
+           sum);
     printf("\n");
 
     /* neighbor list */
     printf("neighbor list length = %d\n", nbrlistlen);
     printf("\n");
   }
-  
+
   cl_int clStatus;
 
   cl_uint numPlatforms;
-  clStatus  = clGetPlatformIDs(0, NULL, &numPlatforms);
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
 
   cl_platform_id clPlatform[numPlatforms];
   clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-  cl_context clContext = clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
- 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
-  
-  const char* clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+
+  const char *clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
   CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"-I src/opencl_nvidia -DVERSION_CHECK=%d", cl_version_check);  //-cl-nv-verbose
+  sprintf(clOptions, "-I src/opencl_nvidia -DVERSION_CHECK=%d",
+          cl_version_check); //-cl-nv-verbose
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
-  
-  cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice6overlap",&clStatus);
+
+  cl_kernel clKernel = clCreateKernel(
+      clProgram, "opencl_cutoff_potential_lattice6overlap", &clStatus);
   CHECK_ERROR("clCreateKernel")
 
   /* setup OpenCL kernel parameters */
   blockDim[0] = 8;
   blockDim[1] = 2;
   blockDim[2] = 8;
-  gridDim[0] = xRegionDim*blockDim[0];
-  gridDim[1] = yRegionDim*blockDim[1];
-  gridDim[2] = 1*blockDim[2];
+  gridDim[0] = xRegionDim * blockDim[0];
+  gridDim[1] = yRegionDim * blockDim[1];
+  gridDim[2] = 1 * blockDim[2];
 
   /* allocate and initialize memory on OpenCL device */
   pb_SwitchToTimer(timers, pb_TimerID_COPY);
   if (verbose) {
     printf("Allocating %.2fMB on OpenCL device for potentials\n",
-           lnall * sizeof(float) / (double) (1024*1024));
+           lnall * sizeof(float) / (double)(1024 * 1024));
   }
-  
-  regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(ener_t),NULL,&clStatus);
+
+  regionZeroCl = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                                lnall * sizeof(ener_t), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
-  clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(ener_t));
+  clMemSet(clCommandQueue, regionZeroCl, 0, lnall * sizeof(ener_t));
 
   if (verbose) {
     printf("Allocating %.2fMB on OpenCL device for atom bins\n",
-           nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
+           nbins * BIN_DEPTH * sizeof(cl_float4) / (double)(1024 * 1024));
   }
 
-  binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
+  binBaseCl =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                     nbins * BIN_DEPTH * sizeof(cl_float4), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
- 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, binBaseCl, CL_TRUE, 0,
+                                  nbins * BIN_DEPTH * sizeof(cl_float4),
+                                  binBaseAddr, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  //Sub buffers are not supported in OpenCL v1.0
-  int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;  
+  // Sub buffers are not supported in OpenCL v1.0
+  int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
 
-  NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
+  NbrListLen =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, sizeof(int), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, NbrListLen, CL_TRUE, 0,
+                                  sizeof(int), &nbrlistlen, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
+  NbrList = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                           NBRLIST_MAXLEN * sizeof(xyz), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, NbrList, CL_TRUE, 0,
+                           nbrlistlen * sizeof(xyz), nbrlist, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  if (verbose) 
+  if (verbose)
     printf("\n");
 
   pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&regionZeroCl);
-  clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
-  clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(int), &(binDim.x));
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), &(binDim.y));
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &binBaseCl);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &offset);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(float), &h);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(float), &cutoff2);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(float), &inv_cutoff2);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &regionZeroCl);
+  clStatus = clSetKernelArg(clKernel, 9, sizeof(cl_mem), &NbrListLen);
+  clStatus = clSetKernelArg(clKernel, 10, sizeof(cl_mem), &NbrList);
   CHECK_ERROR("clSetKernelArg")
 
   /*cl_command_queue cutoffstream;*/
@@ -414,21 +428,22 @@ int gpu_compute_cutoff_potential_lattice6overlap(
 
   /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
   pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION);
-  if(verbose)
+  if (verbose)
     printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
-  for (zRegionIndex = 0;  zRegionIndex < zRegionDim;  zRegionIndex++) {
+  for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
 #ifndef NO_DEBUG
     printf("  computing plane %d\n", zRegionIndex);
-#endif 
-    clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
+#endif
+    clStatus = clSetKernelArg(clKernel, 8, sizeof(int), &zRegionIndex);
     CHECK_ERROR("clSetKernelArg")
-    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL,
+                                      gridDim, blockDim, 0, NULL, NULL);
     CHECK_ERROR("clEnqueueNDRangeKernel")
     clStatus = clFinish(clCommandQueue);
     CHECK_ERROR("clFinish")
   }
 
-  /* 
+  /*
    * handle extra atoms on the CPU, concurrently with the GPU calculations
    */
 
@@ -437,10 +452,10 @@ int gpu_compute_cutoff_potential_lattice6overlap(
     printf("computing extra atoms on CPU\n");
     if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
       fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
-          "for extra atoms\n");
+                      "for extra atoms\n");
       return -1;
     }
-    if(verbose)
+    if (verbose)
       printf("\n");
   }
 
@@ -458,7 +473,9 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   /* copy result regions from OpenCL device */
   pb_SwitchToTimer(timers, pb_TimerID_COPY);
 
-  clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(ener_t),regionZeroAddr,0,NULL,NULL);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, regionZeroCl, CL_TRUE, 0,
+                                 lnall * sizeof(ener_t), regionZeroAddr, 0,
+                                 NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
   /* free OpenCL memory allocations */
@@ -473,28 +490,29 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   clStatus = clReleaseCommandQueue(clCommandQueue);
   clStatus = clReleaseContext(clContext);
 
-  free((void*)clSource[0]);
+  free((void *)clSource[0]);
 
   /*
    * transpose on CPU, updating, producing the final lattice
    */
   /* transpose regions back into lattice */
   pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-  for (k = 0;  k < nz;  k++) {
+  for (k = 0; k < nz; k++) {
     zRegionIndex = (k >> 3);
     zOffset = (k & 7);
 
-    for (j = 0;  j < ny;  j++) {
+    for (j = 0; j < ny; j++) {
       yRegionIndex = (j >> 3);
       yOffset = (j & 7);
 
-      for (i = 0;  i < nx;  i++) {
+      for (i = 0; i < nx; i++) {
         xRegionIndex = (i >> 3);
         xOffset = (i & 7);
 
-        thisRegion = regionZeroAddr
-          + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
-              + xRegionIndex) * REGION_SIZE;
+        thisRegion = regionZeroAddr +
+                     ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim +
+                      xRegionIndex) *
+                         REGION_SIZE;
 
         indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
         index = (k * ny + j) * nx + i;
@@ -502,7 +520,7 @@ int gpu_compute_cutoff_potential_lattice6overlap(
 #ifndef NEIGHBOR_COUNT
         lattice->lattice[index] += thisRegion[indexRegion];
 #else
-	neighbor_count += thisRegion[indexRegion];
+        neighbor_count += thisRegion[indexRegion];
 #endif
       }
     }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/excl.c
index 1216854a9b1f76489015ca6cc9a43a8ca5c959df..10d9e5468be82086609ecbae0e557c30fc0633c9 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/excl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/excl.c
@@ -6,24 +6,22 @@
  *cr
  ***************************************************************************/
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int remove_exclusions(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* exclusion cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int remove_exclusions(Lattice *lattice, /* the lattice */
+                             float cutoff,     /* exclusion cutoff distance */
+                             Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -35,8 +33,8 @@ extern int remove_exclusions(
 
   const float a2 = cutoff * cutoff;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -64,44 +62,45 @@ extern int remove_exclusions(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(atoms->size * sizeof(int));
-  for (n = 0;  n < atoms->size;  n++) {
+  next = (int *)malloc(atoms->size * sizeof(int));
+  for (n = 0; n < atoms->size; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < atoms->size;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < atoms->size; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -112,42 +111,49 @@ extern int remove_exclusions(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
 
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 
           dx = xstart;
           index = jkoff + ia;
           pg = lattice->lattice + index;
 
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
 
-	    /* If atom and lattice point are too close, set the lattice value
-	     * to zero */
-            if (r2 < a2) *pg = 0;
+            /* If atom and lattice point are too close, set the lattice value
+             * to zero */
+            if (r2 < a2)
+              *pg = 0;
           }
         }
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/macros.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/macros.h
index 513e65d64f72d2b9603a9d7e594417feffb324a5..2bd0ad46d3ac72073a85e97d3c7b51fc999fb006 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/macros.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/macros.h
@@ -4,22 +4,24 @@
 #ifdef __DEVICE_EMULATION__
 #define DEBUG
 /* define which grid block and which thread to examine */
-#define BX  0
-#define BY  0
-#define TX  0
-#define TY  0
-#define TZ  0
-#define EMU(code) do { \
-  if (blockIdx.x==BX && blockIdx.y==BY && \
-      threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
-    code; \
-  } \
-} while (0)
-#define INT(n)    printf("%s = %d\n", #n, n)
-#define FLOAT(f)  printf("%s = %g\n", #f, (double)(f))
-#define INT3(n)   printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
-#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
-    (double)(f).y, (double)(f).z, (double)(f).w)
+#define BX 0
+#define BY 0
+#define TX 0
+#define TY 0
+#define TZ 0
+#define EMU(code)                                                              \
+  do {                                                                         \
+    if (blockIdx.x == BX && blockIdx.y == BY && threadIdx.x == TX &&           \
+        threadIdx.y == TY && threadIdx.z == TZ) {                              \
+      code;                                                                    \
+    }                                                                          \
+  } while (0)
+#define INT(n) printf("%s = %d\n", #n, n)
+#define FLOAT(f) printf("%s = %g\n", #f, (double)(f))
+#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
+#define FLOAT4(f)                                                              \
+  printf("%s = %g %g %g %g\n", #f, (double)(f).x, (double)(f).y,               \
+         (double)(f).z, (double)(f).w)
 #else
 #define EMU(code)
 #define INT(n)
@@ -29,13 +31,12 @@
 #endif
 
 // report error from OpenCL
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Errorcode = %d\n", clStatus);  \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Errorcode = %d\n", clStatus);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #undef OPENCL11
@@ -48,7 +49,7 @@
  * reserve enough memory for 11^3 stencil of grid cells
  * this fits within 16K of memory
  */
-#define NBRLIST_DIM  11
+#define NBRLIST_DIM 11
 #define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
 
 /* Normally, we're summing electrostatic potential.  However, for
@@ -57,7 +58,7 @@
  */
 #undef NEIGHBOR_COUNT
 //#define NEIGHBOR_COUNT
- 
+
 #ifndef NEIGHBOR_COUNT
 typedef float ener_t;
 #else
@@ -70,16 +71,16 @@ typedef int ener_t;
  * this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
  * should permit scheduling of up to 3 thread blocks per SM
  */
-#define BIN_DEPTH         8  /* max number of atoms per bin */
-#define BIN_SIZE         32  /* size of bin in floats */
-#define BIN_SHIFT         5  /* # of bits to shift for mul/div by BIN_SIZE */
-#define BIN_CACHE_MAXLEN 32  /* max number of atom bins to cache */
+#define BIN_DEPTH 8         /* max number of atoms per bin */
+#define BIN_SIZE 32         /* size of bin in floats */
+#define BIN_SHIFT 5         /* # of bits to shift for mul/div by BIN_SIZE */
+#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */
 
-#define BIN_LENGTH      4.f  /* spatial length in Angstroms */
-#define BIN_INVLEN  (1.f / BIN_LENGTH)
+#define BIN_LENGTH 4.f /* spatial length in Angstroms */
+#define BIN_INVLEN (1.f / BIN_LENGTH)
 /* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
  * so that bin fill should be 80% (for non-empty regions of space) */
 
-#define REGION_SIZE     512  /* number of floats in lattice region */
+#define REGION_SIZE 512 /* number of floats in lattice region */
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/main.c
index 1e00f3e562d12e4bfd628a497eb56e03cfa9e2f4..bae7ca7339d41724520e1242a9b4d154c1cb073c 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/main.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/main.c
@@ -6,11 +6,11 @@
  *cr
  ***************************************************************************/
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
@@ -18,16 +18,15 @@
 
 #define ERRTOL 1e-4f
 
-#define NOKERNELS             0
-#define CUTOFF1               1
-#define CUTOFF6              32
-#define CUTOFF6OVERLAP       64
-#define CUTOFFCPU         16384
-
+#define NOKERNELS 0
+#define CUTOFF1 1
+#define CUTOFF6 32
+#define CUTOFF6OVERLAP 64
+#define CUTOFFCPU 16384
 
 int appenddata(const char *filename, int size, double time) {
   FILE *fp;
-  fp=fopen(filename, "a");
+  fp = fopen(filename, "a");
   if (fp == NULL) {
     printf("error appending to file %s..\n", filename);
     return -1;
@@ -37,23 +36,19 @@ int appenddata(const char *filename, int size, double time) {
   return 0;
 }
 
-LatticeDim
-lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
-{
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) {
   LatticeDim ret;
 
-  ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
-  ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
-  ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
+  ret.nx = (int)floorf((hi.x - lo.x) / h) + 1;
+  ret.ny = (int)floorf((hi.y - lo.y) / h) + 1;
+  ret.nz = (int)floorf((hi.z - lo.z) / h) + 1;
   ret.lo = lo;
   ret.h = h;
 
   return ret;
 }
 
-Lattice *
-create_lattice(LatticeDim dim)
-{
+Lattice *create_lattice(LatticeDim dim) {
   int size;
   Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
 
@@ -76,10 +71,7 @@ create_lattice(LatticeDim dim)
   return lat;
 }
 
-
-void
-destroy_lattice(Lattice *lat)
-{
+void destroy_lattice(Lattice *lat) {
   if (lat) {
     free(lat->lattice);
     free(lat);
@@ -91,13 +83,13 @@ int main(int argc, char *argv[]) {
 
   LatticeDim lattice_dim;
   Lattice *gpu_lattice;
-  Vec3 min_ext, max_ext;	/* Bounding box of atoms */
-  Vec3 lo, hi;			/* Bounding box with padding  */
+  Vec3 min_ext, max_ext; /* Bounding box of atoms */
+  Vec3 lo, hi;           /* Bounding box with padding  */
 
-  float h = 0.5f;		/* Lattice spacing */
-  float cutoff = 12.f;		/* Cutoff radius */
-  float exclcutoff = 1.f;	/* Radius for exclusion */
-  float padding = 0.5f;		/* Bounding box padding distance */
+  float h = 0.5f;         /* Lattice spacing */
+  float cutoff = 12.f;    /* Cutoff radius */
+  float exclcutoff = 1.f; /* Radius for exclusion */
+  float padding = 0.5f;   /* Bounding box padding distance */
 
   int n;
 
@@ -138,9 +130,10 @@ int main(int argc, char *argv[]) {
   printf("  maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
 
   printf("padding domain by %g Angstroms\n", padding);
-  lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
-  hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
-  printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
+  lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
+  hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
+  printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y,
+         hi.z - lo.z);
 
   lattice_dim = lattice_from_bounding_box(lo, hi, h);
   gpu_lattice = create_lattice(lattice_dim);
@@ -149,7 +142,8 @@ int main(int argc, char *argv[]) {
    *  OpenCL kernel, with overlapped GPU/CPU computation
    *  (Enter and exit the function with the COMPUTE timer active)
    */
-  if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, atom, 0)) {
+  if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff,
+                                                   atom, 0)) {
     fprintf(stderr, "Computation failed\n");
     exit(1);
   }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.h
index b88103818f6499a3cdddd40ff3d5ac345d2762f1..a88ee486f16f0452ec9894a3b2b28d9e961d417e 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/ocl.h
@@ -2,14 +2,13 @@
 #define __OCLH__
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.c
index ac45761fb86afd598dfe24f2ecead5622cf00954..145f59cc065131db3461a04f9674a94afbf0cfb5 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.c
@@ -6,18 +6,16 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
 #include <inttypes.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 #include "atom.h"
 #include "cutoff.h"
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice)
-{
+void write_lattice_summary(const char *filename, Lattice *lattice) {
   float *lattice_data = lattice->lattice;
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
@@ -38,21 +36,21 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int i;
 
     for (i = 0; i < nx * ny * nz; i++)
-      abspotential += fabs((double) lattice_data[i]);
+      abspotential += fabs((double)lattice_data[i]);
 
-    tmp = (float) abspotential;
+    tmp = (float)abspotential;
 
     fwrite(&tmp, 1, sizeof(float), outfile);
-    //fprintf(outfile,"%f\n",tmp);
+    // fprintf(outfile,"%f\n",tmp);
   }
 
   /* Write the size of a lattice plane */
   {
     uint32_t tmp;
 
-    tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
+    tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny);
     fwrite(&tmp, 1, sizeof(uint32_t), outfile);
-    //fprintf(outfile,"%u\n",tmp);
+    // fprintf(outfile,"%u\n",tmp);
   }
 
   /* Write the plane of lattice data at z=0 and z = nz-1 */
@@ -60,11 +58,11 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int plane_size = nx * ny;
 
     fwrite(lattice_data, plane_size, sizeof(float), outfile);
-    fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
-	   outfile);
-//int i;
-   //for(i=0;i<100;i++)
-	//fprintf(outfile,"%f ",lattice_data[i]);
+    fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float),
+           outfile);
+    // int i;
+    // for(i=0;i<100;i++)
+    // fprintf(outfile,"%f ",lattice_data[i]);
   }
 
   /* Cleanup */
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.h
index 2ddd39227e6c043207897e923f9c7076452eff52..78a5f846e2feda2d1142ae0e1ea4f5edb4eb5ad6 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/output.h
@@ -15,8 +15,7 @@
 extern "C" {
 #endif
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice);
+void write_lattice_summary(const char *filename, Lattice *lattice);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/readatom.c
index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/readatom.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_cpu_baseline/readatom.c
@@ -6,36 +6,33 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-
 
 #define LINELEN 96
 #define INITLEN 20
 
-
-Atoms *read_atom_file(const char *fname)
-{
+Atoms *read_atom_file(const char *fname) {
   FILE *file;
   char line[LINELEN];
 
-  Atom *atom;			/* Atom array */
-  int len = INITLEN;		/* Size of atom array */
-  int cnt = 0;			/* Number of atoms read */
+  Atom *atom;        /* Atom array */
+  int len = INITLEN; /* Size of atom array */
+  int cnt = 0;       /* Number of atoms read */
 
   /* open atom "pqr" file */
   file = fopen(fname, "r");
-  if (NULL==file) {
+  if (NULL == file) {
     fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
     return NULL;
   }
 
   /* allocate initial atom array */
-  atom = (Atom *) malloc(len * sizeof(Atom));
-  if (NULL==atom) {
+  atom = (Atom *)malloc(len * sizeof(Atom));
+  if (NULL == atom) {
     fprintf(stderr, "can't allocate memory\n");
     return NULL;
   }
@@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname)
   while (fgets(line, LINELEN, file) != NULL) {
 
     if (strncmp(line, "ATOM  ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
-      continue;  /* skip anything that isn't an atom record */
+      continue; /* skip anything that isn't an atom record */
     }
 
-    if (cnt==len) {  /* extend atom array */
-      void *tmp = realloc(atom, 2*len*sizeof(Atom));
-      if (NULL==tmp) {
+    if (cnt == len) { /* extend atom array */
+      void *tmp = realloc(atom, 2 * len * sizeof(Atom));
+      if (NULL == tmp) {
         fprintf(stderr, "can't allocate more memory\n");
         return NULL;
       }
-      atom = (Atom *) tmp;
+      atom = (Atom *)tmp;
       len *= 2;
     }
 
     /* read position coordinates and charge from atom record */
     if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
-          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
-      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
+               &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
+      fprintf(stderr, "atom record %d does not have expected format\n",
+              cnt + 1);
       return NULL;
     }
 
-    cnt++;  /* count atoms as we store them */
+    cnt++; /* count atoms as we store them */
   }
 
   /* verify EOF and close file */
-  if ( !feof(file) ) {
+  if (!feof(file)) {
     fprintf(stderr, "did not find EOF\n");
     return NULL;
   }
@@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname)
   }
 }
 
-
-void free_atom(Atoms *atom)
-{
+void free_atom(Atoms *atom) {
   if (atom) {
     free(atom->atoms);
     free(atom);
   }
 }
 
-void
-get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
-{
+void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) {
   Atom *atoms = atom->atoms;
   int natoms = atom->size;
   Vec3 lo;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/atom.h
index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/atom.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/atom.h
@@ -13,22 +13,22 @@
 extern "C" {
 #endif
 
-  typedef struct Atom_t {
-    float x, y, z, q;
-  } Atom;
-
-  typedef struct Atoms_t {
-    Atom *atoms;
-    int size;
-  } Atoms;
-
-  typedef struct Vec3_t {
-    float x, y, z;
-  } Vec3;
-
-  Atoms *read_atom_file(const char *fname);
-  void free_atom(Atoms *atom);
-  void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
+typedef struct Atom_t {
+  float x, y, z, q;
+} Atom;
+
+typedef struct Atoms_t {
+  Atom *atoms;
+  int size;
+} Atoms;
+
+typedef struct Vec3_t {
+  float x, y, z;
+} Vec3;
+
+Atoms *read_atom_file(const char *fname);
+void free_atom(Atoms *atom);
+void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutcpu.c
index f0fbdc79f25679053ae2b8fbcd997db178b5a4d4..475a4666e1a6366873dc49d18d311b76ef6cde38 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutcpu.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutcpu.c
@@ -6,11 +6,11 @@
  *cr
  ***************************************************************************/
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
@@ -18,15 +18,14 @@
 #undef DEBUG_PASS_RATE
 #define CHECK_CYLINDER_CPU
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int cpu_compute_cutoff_potential_lattice(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int
+cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                     float cutoff,     /* cutoff distance */
+                                     Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -41,8 +40,8 @@ extern int cpu_compute_cutoff_potential_lattice(
   const float inv_a2 = 1.f / a2;
   float s;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -64,7 +63,7 @@ extern int cpu_compute_cutoff_potential_lattice(
   int ncell, nxcell, nycell, nzcell;
   int *first, *next;
   float inv_cellen = INV_CELLEN;
-  Vec3 minext, maxext;		/* Extent of atom bounding box */
+  Vec3 minext, maxext; /* Extent of atom bounding box */
   float xmin, ymin, zmin;
   float xmax, ymax, zmax;
 
@@ -77,44 +76,45 @@ extern int cpu_compute_cutoff_potential_lattice(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(natoms * sizeof(int));
-  for (n = 0;  n < natoms;  n++) {
+  next = (int *)malloc(natoms * sizeof(int));
+  for (n = 0; n < natoms; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < natoms;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < natoms; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -125,26 +125,33 @@ extern int cpu_compute_cutoff_potential_lattice(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 #ifdef CHECK_CYLINDER_CPU
-          if (dydz2 >= a2) continue;
+          if (dydz2 >= a2)
+            continue;
 #endif
 
           dx = xstart;
@@ -152,27 +159,26 @@ extern int cpu_compute_cutoff_potential_lattice(
           pg = lattice->lattice + index;
 
 #if defined(__INTEL_COMPILER)
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
             s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s;
-            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
+            e = q * (1 / sqrtf(r2)) * s;
+            *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
           }
 #else
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
-            if (r2 >= a2)
-		{
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
+            if (r2 >= a2) {
 #ifdef DEBUG_PASS_RATE
-		  fail_count++;
+              fail_count++;
 #endif
-		  continue;
-		}
+              continue;
+            }
 #ifdef DEBUG_PASS_RATE
-	    pass_count++;
+            pass_count++;
 #endif
             s = (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s * s;
+            e = q * (1 / sqrtf(r2)) * s * s;
             *pg += e;
           }
 #endif
@@ -180,7 +186,7 @@ extern int cpu_compute_cutoff_potential_lattice(
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
@@ -188,8 +194,8 @@ extern int cpu_compute_cutoff_potential_lattice(
 
   /* For debugging: print the number of times that the test passed/failed */
 #ifdef DEBUG_PASS_RATE
-  printf ("Pass :%lld\n", pass_count);
-  printf ("Fail :%lld\n", fail_count);
+  printf("Pass :%lld\n", pass_count);
+  printf("Fail :%lld\n", fail_count);
 #endif
 
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff.h
index 955c788f658ae823e103ea4d040ba4f8c6179fef..13378e5e9be17209476e71e749b44be6733bb8d9 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff.h
@@ -15,54 +15,51 @@ extern "C" {
 
 #define SHIFTED
 
-  /* A structure to record how points in 3D space map to array
-     elements.  Array element (z, y, x)
-     where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
-     maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
-  */
-  typedef struct LatticeDim_t {
-    /* Number of lattice points in x, y, z dimensions */
-    int nx, ny, nz;
+/* A structure to record how points in 3D space map to array
+   elements.  Array element (z, y, x)
+   where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
+   maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
+*/
+typedef struct LatticeDim_t {
+  /* Number of lattice points in x, y, z dimensions */
+  int nx, ny, nz;
 
-    /* Lowest corner of lattice */
-    Vec3 lo;
+  /* Lowest corner of lattice */
+  Vec3 lo;
 
-    /* Lattice spacing */
-    float h;
-  } LatticeDim;
+  /* Lattice spacing */
+  float h;
+} LatticeDim;
 
-  /* An electric potential field sampled on a regular grid.  The
-     lattice size and grid point positions are specified by 'dim'.
-  */
-  typedef struct Lattice_t {
-    LatticeDim dim;
-    float *lattice;
-  } Lattice;
+/* An electric potential field sampled on a regular grid.  The
+   lattice size and grid point positions are specified by 'dim'.
+*/
+typedef struct Lattice_t {
+  LatticeDim dim;
+  float *lattice;
+} Lattice;
 
-  LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
 
-  Lattice *create_lattice(LatticeDim dim);
-  void destroy_lattice(Lattice *);
+Lattice *create_lattice(LatticeDim dim);
+void destroy_lattice(Lattice *);
 
-  int gpu_compute_cutoff_potential_lattice6overlap(
-      struct pb_TimerSet *timers,        /* for measuring execution time */
-      Lattice *lattice,
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms,                      /* array of atoms */
-      int verbose                        /* print info/debug messages */
-    );
+int gpu_compute_cutoff_potential_lattice6overlap(
+    struct pb_TimerSet *timers,     /* for measuring execution time */
+    Lattice *lattice, float cutoff, /* cutoff distance */
+    Atoms *atoms,                   /* array of atoms */
+    int verbose                     /* print info/debug messages */
+);
 
-  int cpu_compute_cutoff_potential_lattice(
-      Lattice *lattice,                  /* the lattice */
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms                       /* array of atoms */
-    );
+int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                         float cutoff,     /* cutoff distance */
+                                         Atoms *atoms      /* array of atoms */
+);
 
-  int remove_exclusions(
-      Lattice *lattice,                  /* the lattice */
-      float exclcutoff,                  /* exclusion cutoff distance */
-      Atoms *atom                        /* array of atoms */
-    );
+int remove_exclusions(Lattice *lattice, /* the lattice */
+                      float exclcutoff, /* exclusion cutoff distance */
+                      Atoms *atom       /* array of atoms */
+);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c
index be32a48fc61636bfaeff041d831a93b9c18e708f..96ebeafbdf377a2d2e6e8e7f2cf5e1e58a3e7a6a 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/cutoff6overlap.c
@@ -7,19 +7,19 @@
  ***************************************************************************/
 #include <CL/cl.h>
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
-#include "ocl.h"
 #include "macros.h"
+#include "ocl.h"
 
-//OpenCL v1.0
-//cl_int3 not defined
+// OpenCL v1.0
+// cl_int3 not defined
 #ifdef CL_VERSION_1_1
 #if CL_VERSION_1_1 != 1
 typedef cl_int4 cl_int3;
@@ -37,15 +37,13 @@ const cl_version_check = 0;
 // we use int4 instead.  Only the 'x', 'y', and 'z' fields of xyz are used.
 typedef cl_int4 xyz;
 
-//extern "C" int gpu_compute_cutoff_potential_lattice6overlap(
+// extern "C" int gpu_compute_cutoff_potential_lattice6overlap(
 int gpu_compute_cutoff_potential_lattice6overlap(
-    struct pb_TimerSet *timers,        /* for measuring execution time */
-    Lattice *lattice,
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms,                      /* array of atoms */
-    int verbose                        /* print info/debug messages */
-    )
-{
+    struct pb_TimerSet *timers,     /* for measuring execution time */
+    Lattice *lattice, float cutoff, /* cutoff distance */
+    Atoms *atoms,                   /* array of atoms */
+    int verbose                     /* print info/debug messages */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -59,8 +57,8 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   xyz nbrlist[NBRLIST_MAXLEN];
   int nbrlistlen = 0;
 
-  int binHistoFull[BIN_DEPTH+1] = { 0 };   /* clear every array element */
-  int binHistoCover[BIN_DEPTH+1] = { 0 };  /* clear every array element */
+  int binHistoFull[BIN_DEPTH + 1] = {0};  /* clear every array element */
+  int binHistoCover[BIN_DEPTH + 1] = {0}; /* clear every array element */
   int num_excluded = 0;
 
   int xRegionDim, yRegionDim, zRegionDim;
@@ -92,16 +90,16 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   size_t gridDim[3], blockDim[3];
 
 #ifdef NEIGHBOR_COUNT
-  double neighbor_count = 0;	/* used to profile the number of atoms near a
-				 * lattice point */
+  double neighbor_count = 0; /* used to profile the number of atoms near a
+                              * lattice point */
 #endif
 
   // Caller has made the "compute" timer active
 
   /* pad lattice to be factor of 8 in each dimension */
-  xRegionDim = (int) ceilf(nx/8.f);
-  yRegionDim = (int) ceilf(ny/8.f);
-  zRegionDim = (int) ceilf(nz/8.f);
+  xRegionDim = (int)ceilf(nx / 8.f);
+  yRegionDim = (int)ceilf(ny / 8.f);
+  zRegionDim = (int)ceilf(nz / 8.f);
 
   lnx = 8 * xRegionDim;
   lny = 8 * yRegionDim;
@@ -109,35 +107,36 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   lnall = lnx * lny * lnz;
 
   /* will receive energies from OpenCL */
-  regionZeroAddr = (ener_t *) malloc(lnall * sizeof(float));
+  regionZeroAddr = (ener_t *)malloc(lnall * sizeof(float));
 
   /* create bins */
-  c = (int) ceil(cutoff * BIN_INVLEN);  /* count extra bins around lattice */
-  binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
-  binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
-  binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
+  c = (int)ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */
+  binDim.x = (int)ceil(lnx * h * BIN_INVLEN) + 2 * c;
+  binDim.y = (int)ceil(lny * h * BIN_INVLEN) + 2 * c;
+  binDim.z = (int)ceil(lnz * h * BIN_INVLEN) + 2 * c;
   nbins = binDim.x * binDim.y * binDim.z;
-  binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
+  binBaseAddr = (cl_float4 *)calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
   binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
 
-  bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
+  bincntBaseAddr = (int *)calloc(nbins, sizeof(int));
   bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
 
   /* create neighbor list */
-  if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
+  if (ceilf(BIN_LENGTH / (8 * h)) == floorf(BIN_LENGTH / (8 * h))) {
     float s = sqrtf(3);
-    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
+    float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH);
     int cnt = 0;
     /* develop neighbor list around 1 cell */
-    if (2*c + 1 > NBRLIST_DIM) {
+    if (2 * c + 1 > NBRLIST_DIM) {
       fprintf(stderr, "must have cutoff <= %f\n",
-          (NBRLIST_DIM-1)/2 * BIN_LENGTH);
+              (NBRLIST_DIM - 1) / 2 * BIN_LENGTH);
       return -1;
     }
-    for (k = -c;  k <= c;  k++) {
-      for (j = -c;  j <= c;  j++) {
-        for (i = -c;  i <= c;  i++) {
-          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
+    for (k = -c; k <= c; k++) {
+      for (j = -c; j <= c; j++) {
+        for (i = -c; i <= c; i++) {
+          if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2)
+            continue;
           nbrlist[cnt].x = i;
           nbrlist[cnt].y = j;
           nbrlist[cnt].z = k;
@@ -146,21 +145,21 @@ int gpu_compute_cutoff_potential_lattice6overlap(
       }
     }
     nbrlistlen = cnt;
-  }
-  else if (8*h <= 2*BIN_LENGTH) {
-    float s = 2.f*sqrtf(3);
-    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
+  } else if (8 * h <= 2 * BIN_LENGTH) {
+    float s = 2.f * sqrtf(3);
+    float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH);
     int cnt = 0;
     /* develop neighbor list around 3-cube of cells */
-    if (2*c + 3 > NBRLIST_DIM) {
+    if (2 * c + 3 > NBRLIST_DIM) {
       fprintf(stderr, "must have cutoff <= %f\n",
-          (NBRLIST_DIM-3)/2 * BIN_LENGTH);
+              (NBRLIST_DIM - 3) / 2 * BIN_LENGTH);
       return -1;
     }
-    for (k = -c;  k <= c;  k++) {
-      for (j = -c;  j <= c;  j++) {
-        for (i = -c;  i <= c;  i++) {
-          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
+    for (k = -c; k <= c; k++) {
+      for (j = -c; j <= c; j++) {
+        for (i = -c; i <= c; i++) {
+          if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2)
+            continue;
           nbrlist[cnt].x = i;
           nbrlist[cnt].y = j;
           nbrlist[cnt].z = k;
@@ -169,8 +168,7 @@ int gpu_compute_cutoff_potential_lattice6overlap(
       }
     }
     nbrlistlen = cnt;
-  }
-  else {
+  } else {
     fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
     return -1;
   }
@@ -178,43 +176,39 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   /* perform geometric hashing of atoms into bins */
   {
     /* array of extra atoms, permit average of one extra per bin */
-    Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
+    Atom *extra_atoms = (Atom *)calloc(nbins, sizeof(Atom));
     int extra_len = 0;
-    
-    for (n = 0;  n < natoms;  n++) {
+
+    for (n = 0; n < natoms; n++) {
       cl_float4 p;
       p.x = atom[n].x - xlo;
       p.y = atom[n].y - ylo;
       p.z = atom[n].z - zlo;
       p.w = atom[n].q;
-      i = (int) floorf(p.x * BIN_INVLEN);
-      j = (int) floorf(p.y * BIN_INVLEN);
-      k = (int) floorf(p.z * BIN_INVLEN);
-      if (i >= -c && i < binDim.x - c &&
-	  j >= -c && j < binDim.y - c &&
-	  k >= -c && k < binDim.z - c &&
-	  atom[n].q != 0) {
-	int index = (k * binDim.y + j) * binDim.x + i;
-	cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
-	int bindex = bincntZeroAddr[index];
-	if (bindex < BIN_DEPTH) {
-	  /* copy atom into bin and increase counter for this bin */
-	  bin[bindex] = p;
-	  bincntZeroAddr[index]++;
-	}
-	else {
-	  /* add index to array of extra atoms to be computed with CPU */
-	  if (extra_len >= nbins) {
-	    fprintf(stderr, "exceeded space for storing extra atoms\n");
-	    return -1;
-	  }
-	  extra_atoms[extra_len] = atom[n];
-	  extra_len++;
-	}
-      }
-      else {
-	/* excluded atoms are either outside bins or neutrally charged */
-	num_excluded++;
+      i = (int)floorf(p.x * BIN_INVLEN);
+      j = (int)floorf(p.y * BIN_INVLEN);
+      k = (int)floorf(p.z * BIN_INVLEN);
+      if (i >= -c && i < binDim.x - c && j >= -c && j < binDim.y - c &&
+          k >= -c && k < binDim.z - c && atom[n].q != 0) {
+        int index = (k * binDim.y + j) * binDim.x + i;
+        cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
+        int bindex = bincntZeroAddr[index];
+        if (bindex < BIN_DEPTH) {
+          /* copy atom into bin and increase counter for this bin */
+          bin[bindex] = p;
+          bincntZeroAddr[index]++;
+        } else {
+          /* copy atom into array of extra atoms to be computed with CPU */
+          if (extra_len >= nbins) {
+            fprintf(stderr, "exceeded space for storing extra atoms\n");
+            return -1;
+          }
+          extra_atoms[extra_len] = atom[n];
+          extra_len++;
+        }
+      } else {
+        /* excluded atoms are either outside bins or neutrally charged */
+        num_excluded++;
       }
     }
 
@@ -226,24 +220,24 @@ int gpu_compute_cutoff_potential_lattice6overlap(
 
   /* bin stats */
   sum = total = 0;
-  for (n = 0;  n < nbins;  n++) {
-    binHistoFull[ bincntBaseAddr[n] ]++;
+  for (n = 0; n < nbins; n++) {
+    binHistoFull[bincntBaseAddr[n]]++;
     sum += bincntBaseAddr[n];
     total += BIN_DEPTH;
   }
-  avgFillFull = sum / (float) total;
+  avgFillFull = sum / (float)total;
   sum = total = 0;
-  for (k = 0;  k < binDim.z - 2*c;  k++) {
-    for (j = 0;  j < binDim.y - 2*c;  j++) {
-      for (i = 0;  i < binDim.x - 2*c;  i++) {
+  for (k = 0; k < binDim.z - 2 * c; k++) {
+    for (j = 0; j < binDim.y - 2 * c; j++) {
+      for (i = 0; i < binDim.x - 2 * c; i++) {
         int index = (k * binDim.y + j) * binDim.x + i;
-        binHistoCover[ bincntZeroAddr[index] ]++;
+        binHistoCover[bincntZeroAddr[index]]++;
         sum += bincntZeroAddr[index];
         total += BIN_DEPTH;
       }
     }
   }
-  avgFillCover = sum / (float) total;
+  avgFillCover = sum / (float)total;
 
   if (verbose) {
     /* report */
@@ -252,25 +246,25 @@ int gpu_compute_cutoff_potential_lattice6overlap(
     printf("cutoff distance = %g\n", cutoff);
     printf("\n");
     printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
-    printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
+    printf("requested space dimensions = %g %g %g\n", nx * h, ny * h, nz * h);
     printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
-    printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
-    printf("number of bytes for lattice data = %u\n", lnall*sizeof(float));
+    printf("expanded space dimensions = %g %g %g\n", lnx * h, lny * h, lnz * h);
+    printf("number of bytes for lattice data = %u\n", lnall * sizeof(float));
     printf("\n");
     printf("bin padding thickness = %d\n", c);
-    printf("bin cover dimensions = %d %d %d\n",
-        binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
+    printf("bin cover dimensions = %d %d %d\n", binDim.x - 2 * c,
+           binDim.y - 2 * c, binDim.z - 2 * c);
     printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
     printf("number of bins = %d\n", nbins);
     printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
     printf("%% overhead space = %g\n",
-        (natoms / (double) (nbins * BIN_DEPTH)) * 100);
+           (natoms / (double)(nbins * BIN_DEPTH)) * 100);
     printf("number of bytes for bin data = %u\n",
-        nbins * BIN_DEPTH * sizeof(cl_float4));
+           nbins * BIN_DEPTH * sizeof(cl_float4));
     printf("\n");
     printf("bin histogram with padding:\n");
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       printf("     number of bins with %d atoms:  %d\n", n, binHistoFull[n]);
       sum += binHistoFull[n];
     }
@@ -279,7 +273,7 @@ int gpu_compute_cutoff_potential_lattice6overlap(
     printf("\n");
     printf("bin histogram excluding padding:\n");
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       printf("     number of bins with %d atoms:  %d\n", n, binHistoCover[n]);
       sum += binHistoCover[n];
     }
@@ -287,122 +281,141 @@ int gpu_compute_cutoff_potential_lattice6overlap(
     printf("     %% average fill:  %g\n", avgFillCover * 100);
     printf("\n");
     printf("number of extra atoms = %d\n", extra->size);
-    printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
+    printf("%% atoms that are extra = %g\n",
+           (extra->size / (double)natoms) * 100);
     printf("\n");
 
     /* sanity check on bins */
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       sum += n * binHistoFull[n];
     }
     sum += extra->size + num_excluded;
     printf("sanity check on bin histogram with edges:  "
-        "sum + others = %d\n", sum);
+           "sum + others = %d\n",
+           sum);
     sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
+    for (n = 0; n <= BIN_DEPTH; n++) {
       sum += n * binHistoCover[n];
     }
     sum += extra->size + num_excluded;
     printf("sanity check on bin histogram excluding edges:  "
-        "sum + others = %d\n", sum);
+           "sum + others = %d\n",
+           sum);
     printf("\n");
 
     /* neighbor list */
     printf("neighbor list length = %d\n", nbrlistlen);
     printf("\n");
   }
-  
+
   cl_int clStatus;
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
 
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs");
 
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
-  
-  const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+
+  const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
   CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"-I src/opencl_nvidia -DVERSION_CHECK=%d", cl_version_check);  //-cl-nv-verbose
+  sprintf(clOptions, "-I src/opencl_nvidia -DVERSION_CHECK=%d",
+          cl_version_check); //-cl-nv-verbose
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
-  
-  cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice6overlap",&clStatus);
+
+  cl_kernel clKernel = clCreateKernel(
+      clProgram, "opencl_cutoff_potential_lattice6overlap", &clStatus);
   CHECK_ERROR("clCreateKernel")
 
   /* setup OpenCL kernel parameters */
   blockDim[0] = 8;
   blockDim[1] = 2;
   blockDim[2] = 8;
-  gridDim[0] = xRegionDim*blockDim[0];
-  gridDim[1] = yRegionDim*blockDim[1];
-  gridDim[2] = 1*blockDim[2];
+  gridDim[0] = xRegionDim * blockDim[0];
+  gridDim[1] = yRegionDim * blockDim[1];
+  gridDim[2] = 1 * blockDim[2];
 
   /* allocate and initialize memory on OpenCL device */
   pb_SwitchToTimer(timers, pb_TimerID_COPY);
   if (verbose) {
     printf("Allocating %.2fMB on OpenCL device for potentials\n",
-           lnall * sizeof(float) / (double) (1024*1024));
+           lnall * sizeof(float) / (double)(1024 * 1024));
   }
-  
-  regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(ener_t),NULL,&clStatus);
+
+  regionZeroCl = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                                lnall * sizeof(ener_t), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
-  clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(ener_t));
+  clMemSet(clCommandQueue, regionZeroCl, 0, lnall * sizeof(ener_t));
 
   if (verbose) {
     printf("Allocating %.2fMB on OpenCL device for atom bins\n",
-           nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
+           nbins * BIN_DEPTH * sizeof(cl_float4) / (double)(1024 * 1024));
   }
 
-  binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
+  binBaseCl =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                     nbins * BIN_DEPTH * sizeof(cl_float4), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
- 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, binBaseCl, CL_TRUE, 0,
+                                  nbins * BIN_DEPTH * sizeof(cl_float4),
+                                  binBaseAddr, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  //Sub buffers are not supported in OpenCL v1.0
-  int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;  
+  // Sub buffers are not supported in OpenCL v1.0
+  int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
 
-  NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
+  NbrListLen =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, sizeof(int), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, NbrListLen, CL_TRUE, 0,
+                                  sizeof(int), &nbrlistlen, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
+  NbrList = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                           NBRLIST_MAXLEN * sizeof(xyz), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, NbrList, CL_TRUE, 0,
+                           nbrlistlen * sizeof(xyz), nbrlist, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  if (verbose) 
+  if (verbose)
     printf("\n");
 
   pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&regionZeroCl);
-  clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
-  clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(int), &(binDim.x));
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), &(binDim.y));
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &binBaseCl);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &offset);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(float), &h);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(float), &cutoff2);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(float), &inv_cutoff2);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &regionZeroCl);
+  clStatus = clSetKernelArg(clKernel, 9, sizeof(cl_mem), &NbrListLen);
+  clStatus = clSetKernelArg(clKernel, 10, sizeof(cl_mem), &NbrList);
   CHECK_ERROR("clSetKernelArg")
 
   /*cl_command_queue cutoffstream;*/
@@ -411,21 +424,22 @@ int gpu_compute_cutoff_potential_lattice6overlap(
 
   /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
   pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION);
-  if(verbose)
+  if (verbose)
     printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
-  for (zRegionIndex = 0;  zRegionIndex < zRegionDim;  zRegionIndex++) {
+  for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
 #ifndef NO_DEBUG
     printf("  computing plane %d\n", zRegionIndex);
-#endif 
-    clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
+#endif
+    clStatus = clSetKernelArg(clKernel, 8, sizeof(int), &zRegionIndex);
     CHECK_ERROR("clSetKernelArg")
-    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL,
+                                      gridDim, blockDim, 0, NULL, NULL);
     CHECK_ERROR("clEnqueueNDRangeKernel")
     clStatus = clFinish(clCommandQueue);
     CHECK_ERROR("clFinish")
   }
 
-  /* 
+  /*
    * handle extra atoms on the CPU, concurrently with the GPU calculations
    */
 
@@ -434,10 +448,10 @@ int gpu_compute_cutoff_potential_lattice6overlap(
     printf("computing extra atoms on CPU\n");
     if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
       fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
-          "for extra atoms\n");
+                      "for extra atoms\n");
       return -1;
     }
-    if(verbose)
+    if (verbose)
       printf("\n");
   }
 
@@ -455,7 +469,9 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   /* copy result regions from OpenCL device */
   pb_SwitchToTimer(timers, pb_TimerID_COPY);
 
-  clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(ener_t),regionZeroAddr,0,NULL,NULL);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, regionZeroCl, CL_TRUE, 0,
+                                 lnall * sizeof(ener_t), regionZeroAddr, 0,
+                                 NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
   /* free OpenCL memory allocations */
@@ -470,28 +486,29 @@ int gpu_compute_cutoff_potential_lattice6overlap(
   clStatus = clReleaseCommandQueue(clCommandQueue);
   clStatus = clReleaseContext(clContext);
 
-  free((void*)clSource[0]);
+  free((void *)clSource[0]);
 
   /*
    * transpose on CPU, updating, producing the final lattice
    */
   /* transpose regions back into lattice */
   pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-  for (k = 0;  k < nz;  k++) {
+  for (k = 0; k < nz; k++) {
     zRegionIndex = (k >> 3);
     zOffset = (k & 7);
 
-    for (j = 0;  j < ny;  j++) {
+    for (j = 0; j < ny; j++) {
       yRegionIndex = (j >> 3);
       yOffset = (j & 7);
 
-      for (i = 0;  i < nx;  i++) {
+      for (i = 0; i < nx; i++) {
         xRegionIndex = (i >> 3);
         xOffset = (i & 7);
 
-        thisRegion = regionZeroAddr
-          + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
-              + xRegionIndex) * REGION_SIZE;
+        thisRegion = regionZeroAddr +
+                     ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim +
+                      xRegionIndex) *
+                         REGION_SIZE;
 
         indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
         index = (k * ny + j) * nx + i;
@@ -499,7 +516,7 @@ int gpu_compute_cutoff_potential_lattice6overlap(
 #ifndef NEIGHBOR_COUNT
         lattice->lattice[index] += thisRegion[indexRegion];
 #else
-	neighbor_count += thisRegion[indexRegion];
+        neighbor_count += thisRegion[indexRegion];
 #endif
       }
     }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/excl.c
index 1216854a9b1f76489015ca6cc9a43a8ca5c959df..10d9e5468be82086609ecbae0e557c30fc0633c9 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/excl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/excl.c
@@ -6,24 +6,22 @@
  *cr
  ***************************************************************************/
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int remove_exclusions(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* exclusion cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int remove_exclusions(Lattice *lattice, /* the lattice */
+                             float cutoff,     /* exclusion cutoff distance */
+                             Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -35,8 +33,8 @@ extern int remove_exclusions(
 
   const float a2 = cutoff * cutoff;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -64,44 +62,45 @@ extern int remove_exclusions(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(atoms->size * sizeof(int));
-  for (n = 0;  n < atoms->size;  n++) {
+  next = (int *)malloc(atoms->size * sizeof(int));
+  for (n = 0; n < atoms->size; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < atoms->size;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < atoms->size; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -112,42 +111,49 @@ extern int remove_exclusions(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
 
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 
           dx = xstart;
           index = jkoff + ia;
           pg = lattice->lattice + index;
 
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
 
-	    /* If atom and lattice point are too close, set the lattice value
-	     * to zero */
-            if (r2 < a2) *pg = 0;
+            /* If atom and lattice point are too close, set the lattice value
+             * to zero */
+            if (r2 < a2)
+              *pg = 0;
           }
         }
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/macros.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/macros.h
index 513e65d64f72d2b9603a9d7e594417feffb324a5..2bd0ad46d3ac72073a85e97d3c7b51fc999fb006 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/macros.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/macros.h
@@ -4,22 +4,24 @@
 #ifdef __DEVICE_EMULATION__
 #define DEBUG
 /* define which grid block and which thread to examine */
-#define BX  0
-#define BY  0
-#define TX  0
-#define TY  0
-#define TZ  0
-#define EMU(code) do { \
-  if (blockIdx.x==BX && blockIdx.y==BY && \
-      threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
-    code; \
-  } \
-} while (0)
-#define INT(n)    printf("%s = %d\n", #n, n)
-#define FLOAT(f)  printf("%s = %g\n", #f, (double)(f))
-#define INT3(n)   printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
-#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
-    (double)(f).y, (double)(f).z, (double)(f).w)
+#define BX 0
+#define BY 0
+#define TX 0
+#define TY 0
+#define TZ 0
+#define EMU(code)                                                              \
+  do {                                                                         \
+    if (blockIdx.x == BX && blockIdx.y == BY && threadIdx.x == TX &&           \
+        threadIdx.y == TY && threadIdx.z == TZ) {                              \
+      code;                                                                    \
+    }                                                                          \
+  } while (0)
+#define INT(n) printf("%s = %d\n", #n, n)
+#define FLOAT(f) printf("%s = %g\n", #f, (double)(f))
+#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
+#define FLOAT4(f)                                                              \
+  printf("%s = %g %g %g %g\n", #f, (double)(f).x, (double)(f).y,               \
+         (double)(f).z, (double)(f).w)
 #else
 #define EMU(code)
 #define INT(n)
@@ -29,13 +31,12 @@
 #endif
 
 // report error from OpenCL
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Errorcode = %d\n", clStatus);  \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Errorcode = %d\n", clStatus);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #undef OPENCL11
@@ -48,7 +49,7 @@
  * reserve enough memory for 11^3 stencil of grid cells
  * this fits within 16K of memory
  */
-#define NBRLIST_DIM  11
+#define NBRLIST_DIM 11
 #define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
 
 /* Normally, we're summing electrostatic potential.  However, for
@@ -57,7 +58,7 @@
  */
 #undef NEIGHBOR_COUNT
 //#define NEIGHBOR_COUNT
- 
+
 #ifndef NEIGHBOR_COUNT
 typedef float ener_t;
 #else
@@ -70,16 +71,16 @@ typedef int ener_t;
  * this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
  * should permit scheduling of up to 3 thread blocks per SM
  */
-#define BIN_DEPTH         8  /* max number of atoms per bin */
-#define BIN_SIZE         32  /* size of bin in floats */
-#define BIN_SHIFT         5  /* # of bits to shift for mul/div by BIN_SIZE */
-#define BIN_CACHE_MAXLEN 32  /* max number of atom bins to cache */
+#define BIN_DEPTH 8         /* max number of atoms per bin */
+#define BIN_SIZE 32         /* size of bin in floats */
+#define BIN_SHIFT 5         /* # of bits to shift for mul/div by BIN_SIZE */
+#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */
 
-#define BIN_LENGTH      4.f  /* spatial length in Angstroms */
-#define BIN_INVLEN  (1.f / BIN_LENGTH)
+#define BIN_LENGTH 4.f /* spatial length in Angstroms */
+#define BIN_INVLEN (1.f / BIN_LENGTH)
 /* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
  * so that bin fill should be 80% (for non-empty regions of space) */
 
-#define REGION_SIZE     512  /* number of floats in lattice region */
+#define REGION_SIZE 512 /* number of floats in lattice region */
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/main.c
index 1e00f3e562d12e4bfd628a497eb56e03cfa9e2f4..bae7ca7339d41724520e1242a9b4d154c1cb073c 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/main.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/main.c
@@ -6,11 +6,11 @@
  *cr
  ***************************************************************************/
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
@@ -18,16 +18,15 @@
 
 #define ERRTOL 1e-4f
 
-#define NOKERNELS             0
-#define CUTOFF1               1
-#define CUTOFF6              32
-#define CUTOFF6OVERLAP       64
-#define CUTOFFCPU         16384
-
+#define NOKERNELS 0
+#define CUTOFF1 1
+#define CUTOFF6 32
+#define CUTOFF6OVERLAP 64
+#define CUTOFFCPU 16384
 
 int appenddata(const char *filename, int size, double time) {
   FILE *fp;
-  fp=fopen(filename, "a");
+  fp = fopen(filename, "a");
   if (fp == NULL) {
     printf("error appending to file %s..\n", filename);
     return -1;
@@ -37,23 +36,19 @@ int appenddata(const char *filename, int size, double time) {
   return 0;
 }
 
-LatticeDim
-lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
-{
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) {
   LatticeDim ret;
 
-  ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
-  ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
-  ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
+  ret.nx = (int)floorf((hi.x - lo.x) / h) + 1;
+  ret.ny = (int)floorf((hi.y - lo.y) / h) + 1;
+  ret.nz = (int)floorf((hi.z - lo.z) / h) + 1;
   ret.lo = lo;
   ret.h = h;
 
   return ret;
 }
 
-Lattice *
-create_lattice(LatticeDim dim)
-{
+Lattice *create_lattice(LatticeDim dim) {
   int size;
   Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
 
@@ -76,10 +71,7 @@ create_lattice(LatticeDim dim)
   return lat;
 }
 
-
-void
-destroy_lattice(Lattice *lat)
-{
+void destroy_lattice(Lattice *lat) {
   if (lat) {
     free(lat->lattice);
     free(lat);
@@ -91,13 +83,13 @@ int main(int argc, char *argv[]) {
 
   LatticeDim lattice_dim;
   Lattice *gpu_lattice;
-  Vec3 min_ext, max_ext;	/* Bounding box of atoms */
-  Vec3 lo, hi;			/* Bounding box with padding  */
+  Vec3 min_ext, max_ext; /* Bounding box of atoms */
+  Vec3 lo, hi;           /* Bounding box with padding  */
 
-  float h = 0.5f;		/* Lattice spacing */
-  float cutoff = 12.f;		/* Cutoff radius */
-  float exclcutoff = 1.f;	/* Radius for exclusion */
-  float padding = 0.5f;		/* Bounding box padding distance */
+  float h = 0.5f;         /* Lattice spacing */
+  float cutoff = 12.f;    /* Cutoff radius */
+  float exclcutoff = 1.f; /* Radius for exclusion */
+  float padding = 0.5f;   /* Bounding box padding distance */
 
   int n;
 
@@ -138,9 +130,10 @@ int main(int argc, char *argv[]) {
   printf("  maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
 
   printf("padding domain by %g Angstroms\n", padding);
-  lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
-  hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
-  printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
+  lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
+  hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
+  printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y,
+         hi.z - lo.z);
 
   lattice_dim = lattice_from_bounding_box(lo, hi, h);
   gpu_lattice = create_lattice(lattice_dim);
@@ -149,7 +142,8 @@ int main(int argc, char *argv[]) {
    *  OpenCL kernel, with overlapped GPU/CPU computation
    *  (Enter and exit the function with the COMPUTE timer active)
    */
-  if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, atom, 0)) {
+  if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff,
+                                                   atom, 0)) {
     fprintf(stderr, "Computation failed\n");
     exit(1);
   }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.h
index b88103818f6499a3cdddd40ff3d5ac345d2762f1..a88ee486f16f0452ec9894a3b2b28d9e961d417e 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/ocl.h
@@ -2,14 +2,13 @@
 #define __OCLH__
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.c
index ac45761fb86afd598dfe24f2ecead5622cf00954..145f59cc065131db3461a04f9674a94afbf0cfb5 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.c
@@ -6,18 +6,16 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
 #include <inttypes.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 #include "atom.h"
 #include "cutoff.h"
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice)
-{
+void write_lattice_summary(const char *filename, Lattice *lattice) {
   float *lattice_data = lattice->lattice;
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
@@ -38,21 +36,21 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int i;
 
     for (i = 0; i < nx * ny * nz; i++)
-      abspotential += fabs((double) lattice_data[i]);
+      abspotential += fabs((double)lattice_data[i]);
 
-    tmp = (float) abspotential;
+    tmp = (float)abspotential;
 
     fwrite(&tmp, 1, sizeof(float), outfile);
-    //fprintf(outfile,"%f\n",tmp);
+    // fprintf(outfile,"%f\n",tmp);
   }
 
   /* Write the size of a lattice plane */
   {
     uint32_t tmp;
 
-    tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
+    tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny);
     fwrite(&tmp, 1, sizeof(uint32_t), outfile);
-    //fprintf(outfile,"%u\n",tmp);
+    // fprintf(outfile,"%u\n",tmp);
   }
 
   /* Write the plane of lattice data at z=0 and z = nz-1 */
@@ -60,11 +58,11 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int plane_size = nx * ny;
 
     fwrite(lattice_data, plane_size, sizeof(float), outfile);
-    fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
-	   outfile);
-//int i;
-   //for(i=0;i<100;i++)
-	//fprintf(outfile,"%f ",lattice_data[i]);
+    fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float),
+           outfile);
+    // int i;
+    // for(i=0;i<100;i++)
+    // fprintf(outfile,"%f ",lattice_data[i]);
   }
 
   /* Cleanup */
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.h
index 2ddd39227e6c043207897e923f9c7076452eff52..78a5f846e2feda2d1142ae0e1ea4f5edb4eb5ad6 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/output.h
@@ -15,8 +15,7 @@
 extern "C" {
 #endif
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice);
+void write_lattice_summary(const char *filename, Lattice *lattice);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/readatom.c
index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/readatom.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/opencl_nvidia/readatom.c
@@ -6,36 +6,33 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-
 
 #define LINELEN 96
 #define INITLEN 20
 
-
-Atoms *read_atom_file(const char *fname)
-{
+Atoms *read_atom_file(const char *fname) {
   FILE *file;
   char line[LINELEN];
 
-  Atom *atom;			/* Atom array */
-  int len = INITLEN;		/* Size of atom array */
-  int cnt = 0;			/* Number of atoms read */
+  Atom *atom;        /* Atom array */
+  int len = INITLEN; /* Size of atom array */
+  int cnt = 0;       /* Number of atoms read */
 
   /* open atom "pqr" file */
   file = fopen(fname, "r");
-  if (NULL==file) {
+  if (NULL == file) {
     fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
     return NULL;
   }
 
   /* allocate initial atom array */
-  atom = (Atom *) malloc(len * sizeof(Atom));
-  if (NULL==atom) {
+  atom = (Atom *)malloc(len * sizeof(Atom));
+  if (NULL == atom) {
     fprintf(stderr, "can't allocate memory\n");
     return NULL;
   }
@@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname)
   while (fgets(line, LINELEN, file) != NULL) {
 
     if (strncmp(line, "ATOM  ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
-      continue;  /* skip anything that isn't an atom record */
+      continue; /* skip anything that isn't an atom record */
     }
 
-    if (cnt==len) {  /* extend atom array */
-      void *tmp = realloc(atom, 2*len*sizeof(Atom));
-      if (NULL==tmp) {
+    if (cnt == len) { /* extend atom array */
+      void *tmp = realloc(atom, 2 * len * sizeof(Atom));
+      if (NULL == tmp) {
         fprintf(stderr, "can't allocate more memory\n");
         return NULL;
       }
-      atom = (Atom *) tmp;
+      atom = (Atom *)tmp;
       len *= 2;
     }
 
     /* read position coordinates and charge from atom record */
     if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
-          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
-      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
+               &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
+      fprintf(stderr, "atom record %d does not have expected format\n",
+              cnt + 1);
       return NULL;
     }
 
-    cnt++;  /* count atoms as we store them */
+    cnt++; /* count atoms as we store them */
   }
 
   /* verify EOF and close file */
-  if ( !feof(file) ) {
+  if (!feof(file)) {
     fprintf(stderr, "did not find EOF\n");
     return NULL;
   }
@@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname)
   }
 }
 
-
-void free_atom(Atoms *atom)
-{
+void free_atom(Atoms *atom) {
   if (atom) {
     free(atom->atoms);
     free(atom);
   }
 }
 
-void
-get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
-{
+void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) {
   Atom *atoms = atom->atoms;
   int natoms = atom->size;
   Vec3 lo;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h
index f5a60058612f4c0a953405e68a5013886bf60c1b..9adf659d371abc6b1bece5643e1faa0cc9a61251 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/atom.h
@@ -13,22 +13,22 @@
 extern "C" {
 #endif
 
-  typedef struct Atom_t {
-    float x, y, z, q;
-  } Atom;
-
-  typedef struct Atoms_t {
-    Atom *atoms;
-    int size;
-  } Atoms;
-
-  typedef struct Vec3_t {
-    float x, y, z;
-  } Vec3;
-
-  Atoms *read_atom_file(const char *fname);
-  void free_atom(Atoms *atom);
-  void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
+typedef struct Atom_t {
+  float x, y, z, q;
+} Atom;
+
+typedef struct Atoms_t {
+  Atom *atoms;
+  int size;
+} Atoms;
+
+typedef struct Vec3_t {
+  float x, y, z;
+} Vec3;
+
+Atoms *read_atom_file(const char *fname);
+void free_atom(Atoms *atom);
+void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c
index f0fbdc79f25679053ae2b8fbcd997db178b5a4d4..475a4666e1a6366873dc49d18d311b76ef6cde38 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutcpu.c
@@ -6,11 +6,11 @@
  *cr
  ***************************************************************************/
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
@@ -18,15 +18,14 @@
 #undef DEBUG_PASS_RATE
 #define CHECK_CYLINDER_CPU
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int cpu_compute_cutoff_potential_lattice(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int
+cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                     float cutoff,     /* cutoff distance */
+                                     Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -41,8 +40,8 @@ extern int cpu_compute_cutoff_potential_lattice(
   const float inv_a2 = 1.f / a2;
   float s;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -64,7 +63,7 @@ extern int cpu_compute_cutoff_potential_lattice(
   int ncell, nxcell, nycell, nzcell;
   int *first, *next;
   float inv_cellen = INV_CELLEN;
-  Vec3 minext, maxext;		/* Extent of atom bounding box */
+  Vec3 minext, maxext; /* Extent of atom bounding box */
   float xmin, ymin, zmin;
   float xmax, ymax, zmax;
 
@@ -77,44 +76,45 @@ extern int cpu_compute_cutoff_potential_lattice(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(natoms * sizeof(int));
-  for (n = 0;  n < natoms;  n++) {
+  next = (int *)malloc(natoms * sizeof(int));
+  for (n = 0; n < natoms; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < natoms;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < natoms; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -125,26 +125,33 @@ extern int cpu_compute_cutoff_potential_lattice(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 #ifdef CHECK_CYLINDER_CPU
-          if (dydz2 >= a2) continue;
+          if (dydz2 >= a2)
+            continue;
 #endif
 
           dx = xstart;
@@ -152,27 +159,26 @@ extern int cpu_compute_cutoff_potential_lattice(
           pg = lattice->lattice + index;
 
 #if defined(__INTEL_COMPILER)
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
             s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s;
-            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
+            e = q * (1 / sqrtf(r2)) * s;
+            *pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
           }
 #else
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
-            if (r2 >= a2)
-		{
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
+            if (r2 >= a2) {
 #ifdef DEBUG_PASS_RATE
-		  fail_count++;
+              fail_count++;
 #endif
-		  continue;
-		}
+              continue;
+            }
 #ifdef DEBUG_PASS_RATE
-	    pass_count++;
+            pass_count++;
 #endif
             s = (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s * s;
+            e = q * (1 / sqrtf(r2)) * s * s;
             *pg += e;
           }
 #endif
@@ -180,7 +186,7 @@ extern int cpu_compute_cutoff_potential_lattice(
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
@@ -188,8 +194,8 @@ extern int cpu_compute_cutoff_potential_lattice(
 
   /* For debugging: print the number of times that the test passed/failed */
 #ifdef DEBUG_PASS_RATE
-  printf ("Pass :%lld\n", pass_count);
-  printf ("Fail :%lld\n", fail_count);
+  printf("Pass :%lld\n", pass_count);
+  printf("Fail :%lld\n", fail_count);
 #endif
 
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h
index 477e5649b6ff4f58690fb80a017f8bcec86d135c..0f8b0ff96aaab0c84bfca49c112b717d568815b9 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff.h
@@ -15,46 +15,44 @@ extern "C" {
 
 #define SHIFTED
 
-  /* A structure to record how points in 3D space map to array
-     elements.  Array element (z, y, x)
-     where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
-     maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
-  */
-  typedef struct LatticeDim_t {
-    /* Number of lattice points in x, y, z dimensions */
-    int nx, ny, nz;
-
-    /* Lowest corner of lattice */
-    Vec3 lo;
-
-    /* Lattice spacing */
-    float h;
-  } LatticeDim;
-
-  /* An electric potential field sampled on a regular grid.  The
-     lattice size and grid point positions are specified by 'dim'.
-  */
-  typedef struct Lattice_t {
-    LatticeDim dim;
-    float *lattice;
-  } Lattice;
-
-  LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
-
-  Lattice *create_lattice(LatticeDim dim);
-  void destroy_lattice(Lattice *);
-
-  int cpu_compute_cutoff_potential_lattice(
-      Lattice *lattice,                  /* the lattice */
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms                       /* array of atoms */
-    );
-
-  int remove_exclusions(
-      Lattice *lattice,                  /* the lattice */
-      float exclcutoff,                  /* exclusion cutoff distance */
-      Atoms *atom                        /* array of atoms */
-    );
+/* A structure to record how points in 3D space map to array
+   elements.  Array element (z, y, x)
+   where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
+   maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
+*/
+typedef struct LatticeDim_t {
+  /* Number of lattice points in x, y, z dimensions */
+  int nx, ny, nz;
+
+  /* Lowest corner of lattice */
+  Vec3 lo;
+
+  /* Lattice spacing */
+  float h;
+} LatticeDim;
+
+/* An electric potential field sampled on a regular grid.  The
+   lattice size and grid point positions are specified by 'dim'.
+*/
+typedef struct Lattice_t {
+  LatticeDim dim;
+  float *lattice;
+} Lattice;
+
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
+
+Lattice *create_lattice(LatticeDim dim);
+void destroy_lattice(Lattice *);
+
+int cpu_compute_cutoff_potential_lattice(Lattice *lattice, /* the lattice */
+                                         float cutoff,     /* cutoff distance */
+                                         Atoms *atoms      /* array of atoms */
+);
+
+int remove_exclusions(Lattice *lattice, /* the lattice */
+                      float exclcutoff, /* exclusion cutoff distance */
+                      Atoms *atom       /* array of atoms */
+);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c
index 7d3b880dafe70f877b596a1e1143489dcef19d2f..31b966e6f4cff21afee17e1ecd33103ec333d08c 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/cutoff6overlap.c
@@ -7,19 +7,19 @@
  ***************************************************************************/
 #include <CL/cl.h>
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
-#include "ocl.h"
 #include "macros.h"
+#include "ocl.h"
 
-//OpenCL v1.0
-//cl_int3 not defined
+// OpenCL v1.0
+// cl_int3 not defined
 #ifdef CL_VERSION_1_1
 #if CL_VERSION_1_1 != 1
 typedef cl_int4 cl_int3;
@@ -34,5 +34,4 @@ const cl_version_check = 0;
 // we use int4 instead.  Only the 'x', 'y', and 'z' fields of xyz are used.
 typedef cl_int4 xyz;
 
-//extern "C" int gpu_compute_cutoff_potential_lattice6overlap(
-
+// extern "C" int gpu_compute_cutoff_potential_lattice6overlap(
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c
index 1216854a9b1f76489015ca6cc9a43a8ca5c959df..10d9e5468be82086609ecbae0e557c30fc0633c9 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/excl.c
@@ -6,24 +6,22 @@
  *cr
  ***************************************************************************/
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
 
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
+#define CELLEN 4.f
+#define INV_CELLEN (1.f / CELLEN)
 
-extern int remove_exclusions(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* exclusion cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
+extern int remove_exclusions(Lattice *lattice, /* the lattice */
+                             float cutoff,     /* exclusion cutoff distance */
+                             Atoms *atoms      /* array of atoms */
+) {
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
   int nz = lattice->dim.nz;
@@ -35,8 +33,8 @@ extern int remove_exclusions(
 
   const float a2 = cutoff * cutoff;
   const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
+  const int radius = (int)ceilf(cutoff * inv_gridspacing) - 1;
+  /* lattice point radius about each atom */
 
   int n;
   int i, j, k;
@@ -64,44 +62,45 @@ extern int remove_exclusions(
   get_atom_extent(&minext, &maxext, atoms);
 
   /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  nxcell = (int)floorf((maxext.x - minext.x) * inv_cellen) + 1;
+  nycell = (int)floorf((maxext.y - minext.y) * inv_cellen) + 1;
+  nzcell = (int)floorf((maxext.z - minext.z) * inv_cellen) + 1;
   ncell = nxcell * nycell * nzcell;
 
   /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
+  first = (int *)malloc(ncell * sizeof(int));
+  for (gindex = 0; gindex < ncell; gindex++) {
     first[gindex] = -1;
   }
-  next = (int *) malloc(atoms->size * sizeof(int));
-  for (n = 0;  n < atoms->size;  n++) {
+  next = (int *)malloc(atoms->size * sizeof(int));
+  for (n = 0; n < atoms->size; n++) {
     next[n] = -1;
   }
 
   /* geometric hashing */
-  for (n = 0;  n < atoms->size;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
+  for (n = 0; n < atoms->size; n++) {
+    if (0 == atom[n].q)
+      continue; /* skip any non-contributing atoms */
+    i = (int)floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int)floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int)floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k * nycell + j) * nxcell + i;
     next[n] = first[gindex];
     first[gindex] = n;
   }
 
   /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
+  for (gindex = 0; gindex < ncell; gindex++) {
+    for (n = first[gindex]; n != -1; n = next[n]) {
       x = atom[n].x - xlo;
       y = atom[n].y - ylo;
       z = atom[n].z - zlo;
       q = atom[n].q;
 
       /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
+      ic = (int)(x * inv_gridspacing);
+      jc = (int)(y * inv_gridspacing);
+      kc = (int)(z * inv_gridspacing);
 
       /* find extent of surrounding box of grid points */
       ia = ic - radius;
@@ -112,42 +111,49 @@ extern int remove_exclusions(
       kb = kc + radius + 1;
 
       /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
+      if (ia < 0)
+        ia = 0;
+      if (ib >= nx)
+        ib = nx - 1;
+      if (ja < 0)
+        ja = 0;
+      if (jb >= ny)
+        jb = ny - 1;
+      if (ka < 0)
+        ka = 0;
+      if (kb >= nz)
+        kb = nz - 1;
 
       /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
+      xstart = ia * gridspacing - x;
+      ystart = ja * gridspacing - y;
+      dz = ka * gridspacing - z;
+      for (k = ka; k <= kb; k++, dz += gridspacing) {
+        koff = k * ny;
+        dz2 = dz * dz;
 
         dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
+        for (j = ja; j <= jb; j++, dy += gridspacing) {
+          jkoff = (koff + j) * nx;
+          dydz2 = dy * dy + dz2;
 
           dx = xstart;
           index = jkoff + ia;
           pg = lattice->lattice + index;
 
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
+          for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
+            r2 = dx * dx + dydz2;
 
-	    /* If atom and lattice point are too close, set the lattice value
-	     * to zero */
-            if (r2 < a2) *pg = 0;
+            /* If atom and lattice point are too close, set the lattice value
+             * to zero */
+            if (r2 < a2)
+              *pg = 0;
           }
         }
       } /* end loop over surrounding grid points */
 
     } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
+  }   /* end loop over gridcells */
 
   /* free memory */
   free(next);
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h
index 2fcf28332ac0169dadbd3a3367c43399c651663b..9095917846a0cbcef7b00da03f6e7fcedaabdd84 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/macros.h
@@ -4,22 +4,24 @@
 #ifdef __DEVICE_EMULATION__
 #define DEBUG
 /* define which grid block and which thread to examine */
-#define BX  0
-#define BY  0
-#define TX  0
-#define TY  0
-#define TZ  0
-#define EMU(code) do { \
-  if (blockIdx.x==BX && blockIdx.y==BY && \
-      threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
-    code; \
-  } \
-} while (0)
-#define INT(n)    printf("%s = %d\n", #n, n)
-#define FLOAT(f)  printf("%s = %g\n", #f, (double)(f))
-#define INT3(n)   printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
-#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
-    (double)(f).y, (double)(f).z, (double)(f).w)
+#define BX 0
+#define BY 0
+#define TX 0
+#define TY 0
+#define TZ 0
+#define EMU(code)                                                              \
+  do {                                                                         \
+    if (blockIdx.x == BX && blockIdx.y == BY && threadIdx.x == TX &&           \
+        threadIdx.y == TY && threadIdx.z == TZ) {                              \
+      code;                                                                    \
+    }                                                                          \
+  } while (0)
+#define INT(n) printf("%s = %d\n", #n, n)
+#define FLOAT(f) printf("%s = %g\n", #f, (double)(f))
+#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
+#define FLOAT4(f)                                                              \
+  printf("%s = %g %g %g %g\n", #f, (double)(f).x, (double)(f).y,               \
+         (double)(f).z, (double)(f).w)
 #else
 #define EMU(code)
 #define INT(n)
@@ -29,12 +31,11 @@
 #endif
 
 // report error from OpenCL
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #undef OPENCL11
@@ -47,7 +48,7 @@
  * reserve enough memory for 11^3 stencil of grid cells
  * this fits within 16K of memory
  */
-#define NBRLIST_DIM  11
+#define NBRLIST_DIM 11
 #define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
 
 /* Normally, we're summing electrostatic potential.  However, for
@@ -56,7 +57,7 @@
  */
 #undef NEIGHBOR_COUNT
 //#define NEIGHBOR_COUNT
- 
+
 #ifndef NEIGHBOR_COUNT
 typedef float ener_t;
 #else
@@ -69,16 +70,16 @@ typedef int ener_t;
  * this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
  * should permit scheduling of up to 3 thread blocks per SM
  */
-#define BIN_DEPTH         8  /* max number of atoms per bin */
-#define BIN_SIZE         32  /* size of bin in floats */
-#define BIN_SHIFT         5  /* # of bits to shift for mul/div by BIN_SIZE */
-#define BIN_CACHE_MAXLEN 32  /* max number of atom bins to cache */
+#define BIN_DEPTH 8         /* max number of atoms per bin */
+#define BIN_SIZE 32         /* size of bin in floats */
+#define BIN_SHIFT 5         /* # of bits to shift for mul/div by BIN_SIZE */
+#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */
 
-#define BIN_LENGTH      4.f  /* spatial length in Angstroms */
-#define BIN_INVLEN  (1.f / BIN_LENGTH)
+#define BIN_LENGTH 4.f /* spatial length in Angstroms */
+#define BIN_INVLEN (1.f / BIN_LENGTH)
 /* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
  * so that bin fill should be 80% (for non-empty regions of space) */
 
-#define REGION_SIZE     512  /* number of floats in lattice region */
+#define REGION_SIZE 512 /* number of floats in lattice region */
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp
index c26621737c4c5979d863ccb7b42a8d4132f1b5c1..caf99a5b37daaa28af83cd058c138af1270feff9 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/main.cpp
@@ -6,16 +6,16 @@
  *cr
  ***************************************************************************/
 
+#include <math.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include <parboil.h>
 
 #include "atom.h"
 #include "cutoff.h"
-#include "output.h"
 #include "macros.h"
+#include "output.h"
 #include <visc.h>
 
 #define ERRTOL 1e-4f
@@ -23,1047 +23,985 @@
 #define NO_DEBUG
 //#undef  NO_DEBUG
 
-#define NOKERNELS             0
-#define CUTOFF1               1
-#define CUTOFF6              32
-#define CUTOFF6OVERLAP       64
-#define CUTOFFCPU         16384
+#define NOKERNELS 0
+#define CUTOFF1 1
+#define CUTOFF6 32
+#define CUTOFF6OVERLAP 64
+#define CUTOFFCPU 16384
 
-#define mul24(x,y) (x)*(y)
+#define mul24(x, y) (x) * (y)
 
 // =================== CUTCP Graph =============================
 
-
 // Define a type for a 3D coordinate.  Only 3 vector components are needed.
 // Using int4 type because int3 support is missing on some platforms.
-typedef struct __attribute__((__packed__)){
-    int x;
-    int y;
-    int z;
-    int w;
+typedef struct __attribute__((__packed__)) {
+  int x;
+  int y;
+  int z;
+  int w;
 } xyz;
 // May want to align these
-typedef struct __attribute__((__packed__)) __attribute__((aligned(16))){
-    float x;
-    float y;
-    float z;
-    float w;
+typedef struct __attribute__((__packed__)) __attribute__((aligned(16))) {
+  float x;
+  float y;
+  float z;
+  float w;
 } float4;
 
 extern float rsqrt(float x);
 
 void Allocation(long block) {
-    // Memory shared between threadblocks
-    size_t bytes_AtomBinCache = sizeof(float)*BIN_CACHE_MAXLEN * BIN_DEPTH * 4;
-    void* AtomBinCache  = __visc__malloc(bytes_AtomBinCache);
-
-    size_t bytes_myBinIndex = sizeof(xyz);
-    void* myBinIndex = __visc__malloc(bytes_myBinIndex);
-    __visc__return(4, AtomBinCache, bytes_AtomBinCache, myBinIndex, bytes_myBinIndex);
+  // Memory shared between threadblocks
+  size_t bytes_AtomBinCache = sizeof(float) * BIN_CACHE_MAXLEN * BIN_DEPTH * 4;
+  void *AtomBinCache = __visc__malloc(bytes_AtomBinCache);
+
+  size_t bytes_myBinIndex = sizeof(xyz);
+  void *myBinIndex = __visc__malloc(bytes_myBinIndex);
+  __visc__return(4, AtomBinCache, bytes_AtomBinCache, myBinIndex,
+                 bytes_myBinIndex);
 }
 
-void CUTCPLeaf(
-    int binDim_x,
-    int binDim_y,
-    float *binBaseAddr, size_t bytes_binBaseAddr,
-    int offset,
-    float h,                /* lattice spacing */
-    float cutoff2,          /* square of cutoff distance */
-    float inv_cutoff2,
-    ener_t *regionZeroAddr, size_t bytes_regionZeroAddr, /* address of lattice regions starting at origin */
-    int zRegionIndex,
-    // constant memory arguments the next two
-    int *NbrListLen, size_t bytes_NbrListLen,
-    xyz *NbrList, size_t bytes_NbrList,
-    // local memory args
-    float* AtomBinCache, size_t bytes_AtomBinCache,
-    int* myBinIndex, size_t bytes_myBinIndex
-)
-{
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-    int lz = __visc__getNodeInstanceID_z(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-    int dimx = __visc__getNumNodeInstances_x(thisNode);
-    int dimy = __visc__getNumNodeInstances_y(thisNode);
-    int gdimx = __visc__getNumNodeInstances_x(parentNode);
-    int gdimy = __visc__getNumNodeInstances_y(parentNode);
-
-    float* binZeroAddr = binBaseAddr + 4*offset;
-
-    /*__local float AtomBinCache[BIN_CACHE_MAXLEN * BIN_DEPTH * 4];*/
-    ener_t *myRegionAddr;
-    /*__local xyz myBinIndex;*/
-
-    const int xRegionIndex = gx;
-    const int yRegionIndex = gy;
-
-    /* thread id */
-    const int tid = (lz*dimy+ly)*dimx+lx;
-
-    /* neighbor index */
-    int nbrid;
-
-    /* this is the start of the sub-region indexed by tid */
-    myRegionAddr = regionZeroAddr + ((zRegionIndex*gdimy
-                                      + yRegionIndex)*gdimx + xRegionIndex)*REGION_SIZE;
-
-    /* spatial coordinate of this lattice point */
-    float x = (8 * xRegionIndex + lx) * h;
-    float y = (8 * yRegionIndex + ly) * h;
-    float z = (8 * zRegionIndex + lz) * h;
-
-    int totalbins = 0;
-    int numbins;
-
-    /* bin number determined by center of region */
-    myBinIndex[0] = (int) floor((8 * xRegionIndex + 4) * h * BIN_INVLEN);
-    myBinIndex[1] = (int) floor((8 * yRegionIndex + 4) * h * BIN_INVLEN);
-    myBinIndex[2] = (int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN);
-
-    /* first neighbor in list for me to cache */
-    nbrid = (tid >> 4);
-
-    numbins = BIN_CACHE_MAXLEN;
+void CUTCPLeaf(int binDim_x, int binDim_y, float *binBaseAddr,
+               size_t bytes_binBaseAddr, int offset,
+               float h,       /* lattice spacing */
+               float cutoff2, /* square of cutoff distance */
+               float inv_cutoff2, ener_t *regionZeroAddr,
+               size_t bytes_regionZeroAddr, /* address of lattice regions
+                                               starting at origin */
+               int zRegionIndex,
+               // constant memory arguments the next two
+               int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList,
+               size_t bytes_NbrList,
+               // local memory args
+               float *AtomBinCache, size_t bytes_AtomBinCache, int *myBinIndex,
+               size_t bytes_myBinIndex) {
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
+                     regionZeroAddr);
+
+  void *thisNode = __visc__getNode();
+  void *parentNode = __visc__getParentNode(thisNode);
+  int lx = __visc__getNodeInstanceID_x(thisNode);
+  int ly = __visc__getNodeInstanceID_y(thisNode);
+  int lz = __visc__getNodeInstanceID_z(thisNode);
+  int gx = __visc__getNodeInstanceID_x(parentNode);
+  int gy = __visc__getNodeInstanceID_y(parentNode);
+  int dimx = __visc__getNumNodeInstances_x(thisNode);
+  int dimy = __visc__getNumNodeInstances_y(thisNode);
+  int gdimx = __visc__getNumNodeInstances_x(parentNode);
+  int gdimy = __visc__getNumNodeInstances_y(parentNode);
+
+  float *binZeroAddr = binBaseAddr + 4 * offset;
+
+  /*__local float AtomBinCache[BIN_CACHE_MAXLEN * BIN_DEPTH * 4];*/
+  ener_t *myRegionAddr;
+  /*__local xyz myBinIndex;*/
+
+  const int xRegionIndex = gx;
+  const int yRegionIndex = gy;
+
+  /* thread id */
+  const int tid = (lz * dimy + ly) * dimx + lx;
+
+  /* neighbor index */
+  int nbrid;
+
+  /* this is the start of the sub-region indexed by tid */
+  myRegionAddr =
+      regionZeroAddr +
+      ((zRegionIndex * gdimy + yRegionIndex) * gdimx + xRegionIndex) *
+          REGION_SIZE;
+
+  /* spatial coordinate of this lattice point */
+  float x = (8 * xRegionIndex + lx) * h;
+  float y = (8 * yRegionIndex + ly) * h;
+  float z = (8 * zRegionIndex + lz) * h;
+
+  int totalbins = 0;
+  int numbins;
+
+  /* bin number determined by center of region */
+  myBinIndex[0] = (int)floor((8 * xRegionIndex + 4) * h * BIN_INVLEN);
+  myBinIndex[1] = (int)floor((8 * yRegionIndex + 4) * h * BIN_INVLEN);
+  myBinIndex[2] = (int)floor((8 * zRegionIndex + 4) * h * BIN_INVLEN);
+
+  /* first neighbor in list for me to cache */
+  nbrid = (tid >> 4);
+
+  numbins = BIN_CACHE_MAXLEN;
 
 #ifndef NEIGHBOR_COUNT
-    ener_t energy0 = 0.f;
-    ener_t energy1 = 0.f;
-    ener_t energy2 = 0.f;
-    ener_t energy3 = 0.f;
+  ener_t energy0 = 0.f;
+  ener_t energy1 = 0.f;
+  ener_t energy2 = 0.f;
+  ener_t energy3 = 0.f;
 #else
-    ener_t energy0 = 0, energy1 = 0, energy2 = 0, energy3 = 0;
+  ener_t energy0 = 0, energy1 = 0, energy2 = 0, energy3 = 0;
 #endif
 
-    for (totalbins = 0;  totalbins < *NbrListLen;  totalbins += numbins) {
+  for (totalbins = 0; totalbins < *NbrListLen; totalbins += numbins) {
 
-        int bincnt;
+    int bincnt;
 
-        /* start of where to write in shared memory */
-        int startoff = BIN_SIZE * (tid >> 4);
+    /* start of where to write in shared memory */
+    int startoff = BIN_SIZE * (tid >> 4);
 
-        /* each half-warp to cache up to 4 atom bins */
-        for (bincnt = 0;  bincnt < 4 && nbrid < *NbrListLen;  bincnt++, nbrid += 8) {
+    /* each half-warp to cache up to 4 atom bins */
+    for (bincnt = 0; bincnt < 4 && nbrid < *NbrListLen; bincnt++, nbrid += 8) {
 
-            int i = myBinIndex[0] + NbrList[nbrid].x;
-            int j = myBinIndex[1] + NbrList[nbrid].y;
-            int k = myBinIndex[2] + NbrList[nbrid].z;
+      int i = myBinIndex[0] + NbrList[nbrid].x;
+      int j = myBinIndex[1] + NbrList[nbrid].y;
+      int k = myBinIndex[2] + NbrList[nbrid].z;
 
-            /* determine global memory location of atom bin */
-            float *p_global = (( float *) binZeroAddr)
-                              + (((mul24(k, binDim_y) + j)*binDim_x + i) << BIN_SHIFT);
+      /* determine global memory location of atom bin */
+      float *p_global =
+          ((float *)binZeroAddr) +
+          (((mul24(k, binDim_y) + j) * binDim_x + i) << BIN_SHIFT);
 
-            /* coalesced read from global memory -
-             * retain same ordering in shared memory for now */
-            int binIndex = startoff + (bincnt << (3 + BIN_SHIFT));
-            int tidmask = tid & 15;
+      /* coalesced read from global memory -
+       * retain same ordering in shared memory for now */
+      int binIndex = startoff + (bincnt << (3 + BIN_SHIFT));
+      int tidmask = tid & 15;
 
-            AtomBinCache[binIndex + tidmask   ] = p_global[tidmask   ];
-            AtomBinCache[binIndex + tidmask+16] = p_global[tidmask+16];
-        }
+      AtomBinCache[binIndex + tidmask] = p_global[tidmask];
+      AtomBinCache[binIndex + tidmask + 16] = p_global[tidmask + 16];
+    }
 
-        __visc__barrier();
-        /* no warp divergence */
-        if (totalbins + BIN_CACHE_MAXLEN > *NbrListLen) {
-            numbins = *NbrListLen - totalbins;
-        }
+    __visc__barrier();
+    /* no warp divergence */
+    if (totalbins + BIN_CACHE_MAXLEN > *NbrListLen) {
+      numbins = *NbrListLen - totalbins;
+    }
 
-        int stopbin = (numbins << BIN_SHIFT);
-        for (bincnt = 0; bincnt < stopbin; bincnt+=BIN_SIZE) {
-            int i;
+    int stopbin = (numbins << BIN_SHIFT);
+    for (bincnt = 0; bincnt < stopbin; bincnt += BIN_SIZE) {
+      int i;
 
-            for (i = 0;  i < BIN_DEPTH;  i++) {
+      for (i = 0; i < BIN_DEPTH; i++) {
 
-                int off = bincnt + (i<<2);
+        int off = bincnt + (i << 2);
 
-                float aq = AtomBinCache[off + 3];
-                if (0.f == aq)
-                    break;  /* no more atoms in bin */
+        float aq = AtomBinCache[off + 3];
+        if (0.f == aq)
+          break; /* no more atoms in bin */
 
-                float dx = AtomBinCache[off    ] - x;
-                float dz = AtomBinCache[off + 2] - z;
-                float dxdz2 = dx*dx + dz*dz;
-                float dy = AtomBinCache[off + 1] - y;
-                float r2 = dy*dy + dxdz2;
+        float dx = AtomBinCache[off] - x;
+        float dz = AtomBinCache[off + 2] - z;
+        float dxdz2 = dx * dx + dz * dz;
+        float dy = AtomBinCache[off + 1] - y;
+        float r2 = dy * dy + dxdz2;
 
 #ifndef NEIGHBOR_COUNT
-                if (r2 < cutoff2)
-                {
-                    float s = (1.f - r2 * inv_cutoff2);
-                    energy0 += aq * rsqrt(r2) * s * s;
-                    //energy0 += aq * (1.0/__visc__sqrt(r2)) * s * s;
-                }
+        if (r2 < cutoff2) {
+          float s = (1.f - r2 * inv_cutoff2);
+          energy0 += aq * rsqrt(r2) * s * s;
+          // energy0 += aq * (1.0/__visc__sqrt(r2)) * s * s;
+        }
 #else
-                energy0 += (r2 < cutoff2);
+        energy0 += (r2 < cutoff2);
 #endif
-                dy -= 2.0f*h;
-                r2 = dy*dy + dxdz2;
+        dy -= 2.0f * h;
+        r2 = dy * dy + dxdz2;
 
 #ifndef NEIGHBOR_COUNT
-                if (r2 < cutoff2)
-                {
-                    float s = (1.f - r2 * inv_cutoff2);
-                    energy1 += aq * rsqrt(r2) * s * s;
-                    //energy1 += aq * (1.0/__visc__sqrt(r2)) * s * s;
-                }
+        if (r2 < cutoff2) {
+          float s = (1.f - r2 * inv_cutoff2);
+          energy1 += aq * rsqrt(r2) * s * s;
+          // energy1 += aq * (1.0/__visc__sqrt(r2)) * s * s;
+        }
 #else
-                energy1 += (r2 < cutoff2);
+        energy1 += (r2 < cutoff2);
 #endif
-                dy -= 2.0f*h;
-                r2 = dy*dy + dxdz2;
+        dy -= 2.0f * h;
+        r2 = dy * dy + dxdz2;
 #ifndef NEIGHBOR_COUNT
-                if (r2 < cutoff2)
-                {
-                    float s = (1.f - r2 * inv_cutoff2);
-                    energy2 += aq * rsqrt(r2) * s * s;
-                    //energy2 += aq * (1.0/__visc__sqrt(r2)) * s * s;
-                }
+        if (r2 < cutoff2) {
+          float s = (1.f - r2 * inv_cutoff2);
+          energy2 += aq * rsqrt(r2) * s * s;
+          // energy2 += aq * (1.0/__visc__sqrt(r2)) * s * s;
+        }
 #else
-                energy2 += (r2 < cutoff2);
+        energy2 += (r2 < cutoff2);
 #endif
-                dy -= 2.0f*h;
-                r2 = dy*dy + dxdz2;
+        dy -= 2.0f * h;
+        r2 = dy * dy + dxdz2;
 #ifndef NEIGHBOR_COUNT
-                if (r2 < cutoff2)
-                {
-                    float s = (1.f - r2 * inv_cutoff2);
-                    energy3 += aq * rsqrt(r2) * s * s;
-                    //energy3 += aq * (1.0/rsqrt(r2)) * s * s;
-                }
+        if (r2 < cutoff2) {
+          float s = (1.f - r2 * inv_cutoff2);
+          energy3 += aq * rsqrt(r2) * s * s;
+          // energy3 += aq * (1.0/rsqrt(r2)) * s * s;
+        }
 #else
-                energy3 += (r2 < cutoff2);
+        energy3 += (r2 < cutoff2);
 #endif
-            } /* end loop over atoms in bin */
-        } /* end loop over cached atom bins */
-        __visc__barrier();
-    } /* end loop over neighbor list */
-
-    /* store into global memory */
-    myRegionAddr[(tid>>4)*64 + (tid&15)     ] = energy0;
-    myRegionAddr[(tid>>4)*64 + (tid&15) + 16] = energy1;
-    myRegionAddr[(tid>>4)*64 + (tid&15) + 32] = energy2;
-    myRegionAddr[(tid>>4)*64 + (tid&15) + 48] = energy3;
+      } /* end loop over atoms in bin */
+    }   /* end loop over cached atom bins */
+    __visc__barrier();
+  } /* end loop over neighbor list */
+
+  /* store into global memory */
+  myRegionAddr[(tid >> 4) * 64 + (tid & 15)] = energy0;
+  myRegionAddr[(tid >> 4) * 64 + (tid & 15) + 16] = energy1;
+  myRegionAddr[(tid >> 4) * 64 + (tid & 15) + 32] = energy2;
+  myRegionAddr[(tid >> 4) * 64 + (tid & 15) + 48] = energy3;
 }
 
-void BlockingCUTCP(
-    int binDim_x,
-    int binDim_y,
-    float4 *binBaseAddr, size_t bytes_binBaseAddr,
-    int offset,
-    float h,                /* lattice spacing */
-    float cutoff2,          /* square of cutoff distance */
-    float inv_cutoff2,
-    ener_t *regionZeroAddr, size_t bytes_regionZeroAddr, /* address of lattice regions starting at origin */
-    int zRegionIndex,
-    // constant memory arguments the next two
-    int *NbrListLen, size_t bytes_NbrListLen,
-    xyz *NbrList, size_t bytes_NbrList,
-    long blockx,
-    long blocky,
-    long blockz
-) {
-
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr);
-
-    void* AllocationNode = __visc__createNodeND(0, Allocation);
-    void* CUTCPLeafNode = __visc__createNodeND(3, CUTCPLeaf, blockx, blocky, blockz);
-
-    // Bind Inputs
-    __visc__bindIn(AllocationNode, 15, 0, 0); // Bind blockx
-    __visc__bindIn(CUTCPLeafNode, 0, 0, 0); // Bind binDim_x
-    __visc__bindIn(CUTCPLeafNode, 1, 1, 0); // Bind binDim_y
-    __visc__bindIn(CUTCPLeafNode, 2, 2, 0); // Bind binBaseAddr
-    __visc__bindIn(CUTCPLeafNode, 3, 3, 0); // Bind bytes_binBaseAddr
-    __visc__bindIn(CUTCPLeafNode, 4, 4, 0); // Bind offset
-    __visc__bindIn(CUTCPLeafNode, 5, 5, 0); // Bind h
-    __visc__bindIn(CUTCPLeafNode, 6, 6, 0); // Bind cutoff2
-    __visc__bindIn(CUTCPLeafNode, 7, 7, 0); // Bind inv_cutoff2
-    __visc__bindIn(CUTCPLeafNode, 8, 8, 0); // Bind regionZeroAddr
-    __visc__bindIn(CUTCPLeafNode, 9, 9, 0); // Bind bytes_regionZeroAddr
-    __visc__bindIn(CUTCPLeafNode, 10, 10, 0); // Bind zRegionIndex
-    __visc__bindIn(CUTCPLeafNode, 11, 11, 0); // Bind NbrListLen
-    __visc__bindIn(CUTCPLeafNode, 12, 12, 0); // Bind bytes_NbrListLen
-    __visc__bindIn(CUTCPLeafNode, 13, 13, 0); // Bind NbrList
-    __visc__bindIn(CUTCPLeafNode, 14, 14, 0); // Bind bytes_NbrList
-
-    // Create Edges
-    __visc__edge(AllocationNode, CUTCPLeafNode, 1, 0, 15, 0); // Edge AtomBinCache
-    __visc__edge(AllocationNode, CUTCPLeafNode, 1, 1, 16, 0); // Edge bytes_AtomBinCache
-    __visc__edge(AllocationNode, CUTCPLeafNode, 1, 2, 17, 0); // Edge myBinIndex
-    __visc__edge(AllocationNode, CUTCPLeafNode, 1, 3, 18, 0); // Edge bytes_myBinIndex
-
+void BlockingCUTCP(int binDim_x, int binDim_y, float4 *binBaseAddr,
+                   size_t bytes_binBaseAddr, int offset,
+                   float h,       /* lattice spacing */
+                   float cutoff2, /* square of cutoff distance */
+                   float inv_cutoff2, ener_t *regionZeroAddr,
+                   size_t bytes_regionZeroAddr, /* address of lattice regions
+                                                   starting at origin */
+                   int zRegionIndex,
+                   // constant memory arguments the next two
+                   int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList,
+                   size_t bytes_NbrList, long blockx, long blocky,
+                   long blockz) {
+
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
+                     regionZeroAddr);
+
+  void *AllocationNode = __visc__createNodeND(0, Allocation);
+  void *CUTCPLeafNode =
+      __visc__createNodeND(3, CUTCPLeaf, blockx, blocky, blockz);
+
+  // Bind Inputs
+  __visc__bindIn(AllocationNode, 15, 0, 0); // Bind blockx
+  __visc__bindIn(CUTCPLeafNode, 0, 0, 0);   // Bind binDim_x
+  __visc__bindIn(CUTCPLeafNode, 1, 1, 0);   // Bind binDim_y
+  __visc__bindIn(CUTCPLeafNode, 2, 2, 0);   // Bind binBaseAddr
+  __visc__bindIn(CUTCPLeafNode, 3, 3, 0);   // Bind bytes_binBaseAddr
+  __visc__bindIn(CUTCPLeafNode, 4, 4, 0);   // Bind offset
+  __visc__bindIn(CUTCPLeafNode, 5, 5, 0);   // Bind h
+  __visc__bindIn(CUTCPLeafNode, 6, 6, 0);   // Bind cutoff2
+  __visc__bindIn(CUTCPLeafNode, 7, 7, 0);   // Bind inv_cutoff2
+  __visc__bindIn(CUTCPLeafNode, 8, 8, 0);   // Bind regionZeroAddr
+  __visc__bindIn(CUTCPLeafNode, 9, 9, 0);   // Bind bytes_regionZeroAddr
+  __visc__bindIn(CUTCPLeafNode, 10, 10, 0); // Bind zRegionIndex
+  __visc__bindIn(CUTCPLeafNode, 11, 11, 0); // Bind NbrListLen
+  __visc__bindIn(CUTCPLeafNode, 12, 12, 0); // Bind bytes_NbrListLen
+  __visc__bindIn(CUTCPLeafNode, 13, 13, 0); // Bind NbrList
+  __visc__bindIn(CUTCPLeafNode, 14, 14, 0); // Bind bytes_NbrList
+
+  // Create Edges
+  __visc__edge(AllocationNode, CUTCPLeafNode, 1, 0, 15, 0); // Edge AtomBinCache
+  __visc__edge(AllocationNode, CUTCPLeafNode, 1, 1, 16,
+               0); // Edge bytes_AtomBinCache
+  __visc__edge(AllocationNode, CUTCPLeafNode, 1, 2, 17, 0); // Edge myBinIndex
+  __visc__edge(AllocationNode, CUTCPLeafNode, 1, 3, 18,
+               0); // Edge bytes_myBinIndex
 }
 
 typedef struct __attribute__((__packed__)) {
-    int binDim_x;
-    int binDim_y;
-    float4 *binBaseAddr;
-    size_t bytes_binBaseAddr;
-    int offset;
-    float h;                /* lattice spacing */
-    float cutoff2;          /* square of cutoff distance */
-    float inv_cutoff2;
-    ener_t *regionZeroAddr;
-    size_t bytes_regionZeroAddr; /* address of lattice regions starting at origin */
-    int zRegionIndex;
-    // constant memory arguments the next two
-    int *NbrListLen;
-    size_t bytes_NbrListLen;
-    xyz *NbrList;
-    size_t bytes_NbrList;
-    long blockx;
-    long blocky;
-    long blockz;
-    long gridx;
-    long gridy;
-    long gridz;
+  int binDim_x;
+  int binDim_y;
+  float4 *binBaseAddr;
+  size_t bytes_binBaseAddr;
+  int offset;
+  float h;       /* lattice spacing */
+  float cutoff2; /* square of cutoff distance */
+  float inv_cutoff2;
+  ener_t *regionZeroAddr;
+  size_t
+      bytes_regionZeroAddr; /* address of lattice regions starting at origin */
+  int zRegionIndex;
+  // constant memory arguments the next two
+  int *NbrListLen;
+  size_t bytes_NbrListLen;
+  xyz *NbrList;
+  size_t bytes_NbrList;
+  long blockx;
+  long blocky;
+  long blockz;
+  long gridx;
+  long gridy;
+  long gridz;
 } RootIn;
 
-void packData(
-    RootIn* args,
-    int binDim_x,
-    int binDim_y,
-    float4 *binBaseAddr, size_t bytes_binBaseAddr,
-    int offset,
-    float h,                /* lattice spacing */
-    float cutoff2,          /* square of cutoff distance */
-    float inv_cutoff2,
-    ener_t *regionZeroAddr, size_t bytes_regionZeroAddr, /* address of lattice regions starting at origin */
-    int zRegionIndex,
-    // constant memory arguments the next two
-    int *NbrListLen, size_t bytes_NbrListLen,
-    xyz *NbrList, size_t bytes_NbrList,
-    long blockx,
-    long blocky,
-    long blockz,
-    long gridx,
-    long gridy,
-    long gridz
-) {
-    args->binDim_x = binDim_x;
-    args->binDim_y = binDim_y;
-    args->binBaseAddr = binBaseAddr;
-    args->bytes_binBaseAddr = bytes_binBaseAddr;
-    args->offset = offset;
-    args->h = h;                /* lattice spacing */
-    args->cutoff2 = cutoff2;          /* square of cutoff distance */
-    args->inv_cutoff2 = inv_cutoff2;
-    args->regionZeroAddr = regionZeroAddr;
-    args->bytes_regionZeroAddr = bytes_regionZeroAddr; /* address of lattice regions starting at origin */
-    args->zRegionIndex = zRegionIndex;
-    // constant memory arguments the next two
-    args->NbrListLen = NbrListLen;
-    args->bytes_NbrListLen = bytes_NbrListLen;
-    args->NbrList = NbrList;
-    args->bytes_NbrList = bytes_NbrList;
-    args->blockx = blockx;
-    args->blocky = blocky;
-    args->blockz = blockz;
-    args->gridx = gridx;
-    args->gridy = gridy;
-    args->gridz = gridz;
-
+void packData(RootIn *args, int binDim_x, int binDim_y, float4 *binBaseAddr,
+              size_t bytes_binBaseAddr, int offset,
+              float h,       /* lattice spacing */
+              float cutoff2, /* square of cutoff distance */
+              float inv_cutoff2, ener_t *regionZeroAddr,
+              size_t bytes_regionZeroAddr, /* address of lattice regions
+                                              starting at origin */
+              int zRegionIndex,
+              // constant memory arguments the next two
+              int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList,
+              size_t bytes_NbrList, long blockx, long blocky, long blockz,
+              long gridx, long gridy, long gridz) {
+  args->binDim_x = binDim_x;
+  args->binDim_y = binDim_y;
+  args->binBaseAddr = binBaseAddr;
+  args->bytes_binBaseAddr = bytes_binBaseAddr;
+  args->offset = offset;
+  args->h = h;             /* lattice spacing */
+  args->cutoff2 = cutoff2; /* square of cutoff distance */
+  args->inv_cutoff2 = inv_cutoff2;
+  args->regionZeroAddr = regionZeroAddr;
+  args->bytes_regionZeroAddr =
+      bytes_regionZeroAddr; /* address of lattice regions starting at origin */
+  args->zRegionIndex = zRegionIndex;
+  // constant memory arguments the next two
+  args->NbrListLen = NbrListLen;
+  args->bytes_NbrListLen = bytes_NbrListLen;
+  args->NbrList = NbrList;
+  args->bytes_NbrList = bytes_NbrList;
+  args->blockx = blockx;
+  args->blocky = blocky;
+  args->blockz = blockz;
+  args->gridx = gridx;
+  args->gridy = gridy;
+  args->gridz = gridz;
 }
 
-void CUTCPRoot(
-    int binDim_x,
-    int binDim_y,
-    float4 *binBaseAddr, size_t bytes_binBaseAddr,
-    int offset,
-    float h,                /* lattice spacing */
-    float cutoff2,          /* square of cutoff distance */
-    float inv_cutoff2,
-    ener_t *regionZeroAddr, size_t bytes_regionZeroAddr, /* address of lattice regions starting at origin */
-    int zRegionIndex,
-    // constant memory arguments the next two
-    int *NbrListLen, size_t bytes_NbrListLen,
-    xyz *NbrList, size_t bytes_NbrList,
-    long blockx,
-    long blocky,
-    long blockz,
-    long gridx,
-    long gridy,
-    long gridz
-) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr);
-
-    void* BlockingCUTCPNode = __visc__createNodeND(3, BlockingCUTCP, gridx, gridy, gridz);
-
-    // Bind Inputs
-    __visc__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x
-    __visc__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y
-    __visc__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr
-    __visc__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr
-    __visc__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset
-    __visc__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h
-    __visc__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2
-    __visc__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2
-    __visc__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr
-    __visc__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr
-    __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex
-    __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen
-    __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen
-    __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList
-    __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList
-    __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx
-    __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky
-    __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz
-
+void CUTCPRoot(int binDim_x, int binDim_y, float4 *binBaseAddr,
+               size_t bytes_binBaseAddr, int offset,
+               float h,       /* lattice spacing */
+               float cutoff2, /* square of cutoff distance */
+               float inv_cutoff2, ener_t *regionZeroAddr,
+               size_t bytes_regionZeroAddr, /* address of lattice regions
+                                               starting at origin */
+               int zRegionIndex,
+               // constant memory arguments the next two
+               int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList,
+               size_t bytes_NbrList, long blockx, long blocky, long blockz,
+               long gridx, long gridy, long gridz) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
+                     regionZeroAddr);
+
+  void *BlockingCUTCPNode =
+      __visc__createNodeND(3, BlockingCUTCP, gridx, gridy, gridz);
+
+  // Bind Inputs
+  __visc__bindIn(BlockingCUTCPNode, 0, 0, 0);   // Bind binDim_x
+  __visc__bindIn(BlockingCUTCPNode, 1, 1, 0);   // Bind binDim_y
+  __visc__bindIn(BlockingCUTCPNode, 2, 2, 0);   // Bind binBaseAddr
+  __visc__bindIn(BlockingCUTCPNode, 3, 3, 0);   // Bind bytes_binBaseAddr
+  __visc__bindIn(BlockingCUTCPNode, 4, 4, 0);   // Bind offset
+  __visc__bindIn(BlockingCUTCPNode, 5, 5, 0);   // Bind h
+  __visc__bindIn(BlockingCUTCPNode, 6, 6, 0);   // Bind cutoff2
+  __visc__bindIn(BlockingCUTCPNode, 7, 7, 0);   // Bind inv_cutoff2
+  __visc__bindIn(BlockingCUTCPNode, 8, 8, 0);   // Bind regionZeroAddr
+  __visc__bindIn(BlockingCUTCPNode, 9, 9, 0);   // Bind bytes_regionZeroAddr
+  __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex
+  __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen
+  __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen
+  __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList
+  __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList
+  __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx
+  __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky
+  __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz
 }
 
-
-void CUTCPWrapper(
-    int binDim_x,
-    int binDim_y,
-    float4 *binBaseAddr, size_t bytes_binBaseAddr,
-    int offset,
-    float h,                /* lattice spacing */
-    float cutoff2,          /* square of cutoff distance */
-    float inv_cutoff2,
-    ener_t *regionZeroAddr, size_t bytes_regionZeroAddr, /* address of lattice regions starting at origin */
-    int zRegionIndex,
-    // constant memory arguments the next two
-    int *NbrListLen, size_t bytes_NbrListLen,
-    xyz *NbrList, size_t bytes_NbrList,
-    long blockx,
-    long blocky,
-    long blockz,
-    long gridx,
-    long gridy,
-    long gridz
-) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1, regionZeroAddr);
-
-    void* BlockingCUTCPNode = __visc__createNodeND(0, CUTCPRoot);
-
-    // Bind Inputs
-    __visc__bindIn(BlockingCUTCPNode, 0, 0, 0); // Bind binDim_x
-    __visc__bindIn(BlockingCUTCPNode, 1, 1, 0); // Bind binDim_y
-    __visc__bindIn(BlockingCUTCPNode, 2, 2, 0); // Bind binBaseAddr
-    __visc__bindIn(BlockingCUTCPNode, 3, 3, 0); // Bind bytes_binBaseAddr
-    __visc__bindIn(BlockingCUTCPNode, 4, 4, 0); // Bind offset
-    __visc__bindIn(BlockingCUTCPNode, 5, 5, 0); // Bind h
-    __visc__bindIn(BlockingCUTCPNode, 6, 6, 0); // Bind cutoff2
-    __visc__bindIn(BlockingCUTCPNode, 7, 7, 0); // Bind inv_cutoff2
-    __visc__bindIn(BlockingCUTCPNode, 8, 8, 0); // Bind regionZeroAddr
-    __visc__bindIn(BlockingCUTCPNode, 9, 9, 0); // Bind bytes_regionZeroAddr
-    __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex
-    __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen
-    __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen
-    __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList
-    __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList
-    __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx
-    __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky
-    __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz
-    __visc__bindIn(BlockingCUTCPNode, 18, 18, 0); // Bind gridx
-    __visc__bindIn(BlockingCUTCPNode, 19, 19, 0); // Bind gridy
-    __visc__bindIn(BlockingCUTCPNode, 20, 20, 0); // Bind gridz
+void CUTCPWrapper(int binDim_x, int binDim_y, float4 *binBaseAddr,
+                  size_t bytes_binBaseAddr, int offset,
+                  float h,       /* lattice spacing */
+                  float cutoff2, /* square of cutoff distance */
+                  float inv_cutoff2, ener_t *regionZeroAddr,
+                  size_t bytes_regionZeroAddr, /* address of lattice regions
+                                                  starting at origin */
+                  int zRegionIndex,
+                  // constant memory arguments the next two
+                  int *NbrListLen, size_t bytes_NbrListLen, xyz *NbrList,
+                  size_t bytes_NbrList, long blockx, long blocky, long blockz,
+                  long gridx, long gridy, long gridz) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(4, binBaseAddr, regionZeroAddr, NbrListLen, NbrList, 1,
+                     regionZeroAddr);
+
+  void *BlockingCUTCPNode = __visc__createNodeND(0, CUTCPRoot);
+
+  // Bind Inputs
+  __visc__bindIn(BlockingCUTCPNode, 0, 0, 0);   // Bind binDim_x
+  __visc__bindIn(BlockingCUTCPNode, 1, 1, 0);   // Bind binDim_y
+  __visc__bindIn(BlockingCUTCPNode, 2, 2, 0);   // Bind binBaseAddr
+  __visc__bindIn(BlockingCUTCPNode, 3, 3, 0);   // Bind bytes_binBaseAddr
+  __visc__bindIn(BlockingCUTCPNode, 4, 4, 0);   // Bind offset
+  __visc__bindIn(BlockingCUTCPNode, 5, 5, 0);   // Bind h
+  __visc__bindIn(BlockingCUTCPNode, 6, 6, 0);   // Bind cutoff2
+  __visc__bindIn(BlockingCUTCPNode, 7, 7, 0);   // Bind inv_cutoff2
+  __visc__bindIn(BlockingCUTCPNode, 8, 8, 0);   // Bind regionZeroAddr
+  __visc__bindIn(BlockingCUTCPNode, 9, 9, 0);   // Bind bytes_regionZeroAddr
+  __visc__bindIn(BlockingCUTCPNode, 10, 10, 0); // Bind zRegionIndex
+  __visc__bindIn(BlockingCUTCPNode, 11, 11, 0); // Bind NbrListLen
+  __visc__bindIn(BlockingCUTCPNode, 12, 12, 0); // Bind bytes_NbrListLen
+  __visc__bindIn(BlockingCUTCPNode, 13, 13, 0); // Bind NbrList
+  __visc__bindIn(BlockingCUTCPNode, 14, 14, 0); // Bind bytes_NbrList
+  __visc__bindIn(BlockingCUTCPNode, 15, 15, 0); // Bind blockx
+  __visc__bindIn(BlockingCUTCPNode, 16, 16, 0); // Bind blocky
+  __visc__bindIn(BlockingCUTCPNode, 17, 17, 0); // Bind blockz
+  __visc__bindIn(BlockingCUTCPNode, 18, 18, 0); // Bind gridx
+  __visc__bindIn(BlockingCUTCPNode, 19, 19, 0); // Bind gridy
+  __visc__bindIn(BlockingCUTCPNode, 20, 20, 0); // Bind gridz
 }
 
 // ==================== Host Code ==============================
 
 int gpu_compute_cutoff_potential_lattice6overlap(
-    struct pb_TimerSet *timers,        /* for measuring execution time */
-    Lattice *lattice,
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms,                      /* array of atoms */
-    int verbose                        /* print info/debug messages */
+    struct pb_TimerSet *timers,     /* for measuring execution time */
+    Lattice *lattice, float cutoff, /* cutoff distance */
+    Atoms *atoms,                   /* array of atoms */
+    int verbose                     /* print info/debug messages */
 );
 
 int appenddata(const char *filename, int size, double time) {
-    FILE *fp;
-    fp=fopen(filename, "a");
-    if (fp == NULL) {
-        printf("error appending to file %s..\n", filename);
-        return -1;
-    }
-    fprintf(fp, "%d  %.3f\n", size, time);
-    fclose(fp);
-    return 0;
+  FILE *fp;
+  fp = fopen(filename, "a");
+  if (fp == NULL) {
+    printf("error appending to file %s..\n", filename);
+    return -1;
+  }
+  fprintf(fp, "%d  %.3f\n", size, time);
+  fclose(fp);
+  return 0;
 }
 
-LatticeDim
-lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
-{
-    LatticeDim ret;
+LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h) {
+  LatticeDim ret;
 
-    ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
-    ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
-    ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
-    ret.lo = lo;
-    ret.h = h;
+  ret.nx = (int)floorf((hi.x - lo.x) / h) + 1;
+  ret.ny = (int)floorf((hi.y - lo.y) / h) + 1;
+  ret.nz = (int)floorf((hi.z - lo.z) / h) + 1;
+  ret.lo = lo;
+  ret.h = h;
 
-    return ret;
+  return ret;
 }
 
-Lattice *
-create_lattice(LatticeDim dim)
-{
-    int size;
-    Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
+Lattice *create_lattice(LatticeDim dim) {
+  int size;
+  Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
 
-    if (lat == NULL) {
-        fprintf(stderr, "Out of memory\n");
-        exit(1);
-    }
+  if (lat == NULL) {
+    fprintf(stderr, "Out of memory\n");
+    exit(1);
+  }
 
-    lat->dim = dim;
+  lat->dim = dim;
 
-    /* Round up the allocated size to a multiple of 8 */
-    size = ((dim.nx * dim.ny * dim.nz) + 7) & ~7;
-    lat->lattice = (float *)calloc(size, sizeof(float));
+  /* Round up the allocated size to a multiple of 8 */
+  size = ((dim.nx * dim.ny * dim.nz) + 7) & ~7;
+  lat->lattice = (float *)calloc(size, sizeof(float));
 
-    if (lat->lattice == NULL) {
-        fprintf(stderr, "Out of memory\n");
-        exit(1);
-    }
+  if (lat->lattice == NULL) {
+    fprintf(stderr, "Out of memory\n");
+    exit(1);
+  }
 
-    return lat;
+  return lat;
 }
 
-
-void
-destroy_lattice(Lattice *lat)
-{
-    if (lat) {
-        free(lat->lattice);
-        free(lat);
-    }
+void destroy_lattice(Lattice *lat) {
+  if (lat) {
+    free(lat->lattice);
+    free(lat);
+  }
 }
 
 int main(int argc, char *argv[]) {
-    Atoms *atom;
+  Atoms *atom;
 
-    LatticeDim lattice_dim;
-    Lattice *gpu_lattice;
-    Vec3 min_ext, max_ext;	/* Bounding box of atoms */
-    Vec3 lo, hi;			/* Bounding box with padding  */
+  LatticeDim lattice_dim;
+  Lattice *gpu_lattice;
+  Vec3 min_ext, max_ext; /* Bounding box of atoms */
+  Vec3 lo, hi;           /* Bounding box with padding  */
 
-    float h = 0.5f;		/* Lattice spacing */
-    float cutoff = 12.f;		/* Cutoff radius */
-    float exclcutoff = 1.f;	/* Radius for exclusion */
-    float padding = 0.5f;		/* Bounding box padding distance */
+  float h = 0.5f;         /* Lattice spacing */
+  float cutoff = 12.f;    /* Cutoff radius */
+  float exclcutoff = 1.f; /* Radius for exclusion */
+  float padding = 0.5f;   /* Bounding box padding distance */
 
-    int n;
+  int n;
 
-    struct pb_Parameters *parameters;
-    struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+  struct pb_TimerSet timers;
 
-    /* Read input parameters */
-    parameters = pb_ReadParameters(&argc, argv);
-    if (parameters == NULL) {
-        exit(1);
-    }
-
-    /* Expect one input file */
-    if (pb_Parameters_CountInputs(parameters) != 1) {
-        fprintf(stderr, "Expecting one input file\n");
-        exit(1);
-    }
-
-
-    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  /* Read input parameters */
+  parameters = pb_ReadParameters(&argc, argv);
+  if (parameters == NULL) {
+    exit(1);
+  }
 
-    {
-        const char *pqrfilename = parameters->inpFiles[0];
-
-        if (!(atom = read_atom_file(pqrfilename))) {
-            fprintf(stderr, "read_atom_file() failed\n");
-            exit(1);
-        }
-        printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
-    }
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    /* find extent of domain */
-    get_atom_extent(&min_ext, &max_ext, atom);
-    printf("extent of domain is:\n");
-    printf("  minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z);
-    printf("  maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
-
-    printf("padding domain by %g Angstroms\n", padding);
-    lo = (Vec3) {
-        min_ext.x - padding, min_ext.y - padding, min_ext.z - padding
-    };
-    hi = (Vec3) {
-        max_ext.x + padding, max_ext.y + padding, max_ext.z + padding
-    };
-    printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
-
-    lattice_dim = lattice_from_bounding_box(lo, hi, h);
-    gpu_lattice = create_lattice(lattice_dim);
-
-    /*
-     *  OpenCL kernel, with overlapped GPU/CPU computation
-     *  (Enter and exit the function with the COMPUTE timer active)
-     */
-    if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff, atom, 0)) {
-        fprintf(stderr, "Computation failed\n");
-        exit(1);
-    }
+  /* Expect one input file */
+  if (pb_Parameters_CountInputs(parameters) != 1) {
+    fprintf(stderr, "Expecting one input file\n");
+    exit(1);
+  }
 
-    /*
-     * Zero the lattice points that are too close to an atom.  This is
-     * necessary for numerical stability.
-     */
-    if (remove_exclusions(gpu_lattice, exclcutoff, atom)) {
-        fprintf(stderr, "remove_exclusions() failed for gpu lattice\n");
-        exit(1);
-    }
+  // pb_SwitchToTimer(&timers, pb_TimerID_IO);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-    
+  {
+    const char *pqrfilename = parameters->inpFiles[0];
 
-    /* Print output */
-    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    if (parameters->outFile) {
-        write_lattice_summary(parameters->outFile, gpu_lattice);
+    if (!(atom = read_atom_file(pqrfilename))) {
+      fprintf(stderr, "read_atom_file() failed\n");
+      exit(1);
     }
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    /* Cleanup */
-    destroy_lattice(gpu_lattice);
-    free_atom(atom);
-
-    pb_FreeParameters(parameters);
-    return 0;
+    printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  /* find extent of domain */
+  get_atom_extent(&min_ext, &max_ext, atom);
+  printf("extent of domain is:\n");
+  printf("  minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z);
+  printf("  maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
+
+  printf("padding domain by %g Angstroms\n", padding);
+  lo = (Vec3){min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
+  hi = (Vec3){max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
+  printf("domain lengths are %g by %g by %g\n", hi.x - lo.x, hi.y - lo.y,
+         hi.z - lo.z);
+
+  lattice_dim = lattice_from_bounding_box(lo, hi, h);
+  gpu_lattice = create_lattice(lattice_dim);
+
+  /*
+   *  OpenCL kernel, with overlapped GPU/CPU computation
+   *  (Enter and exit the function with the COMPUTE timer active)
+   */
+  if (gpu_compute_cutoff_potential_lattice6overlap(&timers, gpu_lattice, cutoff,
+                                                   atom, 0)) {
+    fprintf(stderr, "Computation failed\n");
+    exit(1);
+  }
+
+  /*
+   * Zero the lattice points that are too close to an atom.  This is
+   * necessary for numerical stability.
+   */
+  if (remove_exclusions(gpu_lattice, exclcutoff, atom)) {
+    fprintf(stderr, "remove_exclusions() failed for gpu lattice\n");
+    exit(1);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  __visc__cleanup();
+
+  /* Print output */
+  // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (parameters->outFile) {
+    write_lattice_summary(parameters->outFile, gpu_lattice);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  /* Cleanup */
+  destroy_lattice(gpu_lattice);
+  free_atom(atom);
+
+  pb_FreeParameters(parameters);
+  return 0;
 }
 
 int gpu_compute_cutoff_potential_lattice6overlap(
-    struct pb_TimerSet *timers,        /* for measuring execution time */
-    Lattice *lattice,
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms,                      /* array of atoms */
-    int verbose                        /* print info/debug messages */
-)
-{
-    int nx = lattice->dim.nx;
-    int ny = lattice->dim.ny;
-    int nz = lattice->dim.nz;
-    float xlo = lattice->dim.lo.x;
-    float ylo = lattice->dim.lo.y;
-    float zlo = lattice->dim.lo.z;
-    float h = lattice->dim.h;
-    int natoms = atoms->size;
-    Atom *atom = atoms->atoms;
-
-    xyz nbrlist[NBRLIST_MAXLEN];
-    size_t bytes_nbrlist = sizeof(xyz) * NBRLIST_MAXLEN;
-    int* nbrlistlen = (int*) malloc(sizeof(int));
-
-    int binHistoFull[BIN_DEPTH+1] = { 0 };   /* clear every array element */
-    int binHistoCover[BIN_DEPTH+1] = { 0 };  /* clear every array element */
-    int num_excluded = 0;
-
-    int xRegionDim, yRegionDim, zRegionDim;
-    int xRegionIndex, yRegionIndex, zRegionIndex;
-    int xOffset, yOffset, zOffset;
-    int lnx, lny, lnz, lnall;
-    ener_t *regionZeroAddr, *thisRegion;
-    int index, indexRegion;
-
-    int c;
-    xyz binDim;
-    int nbins;
-    float4 *binBaseAddr, *binZeroAddr;
-    int *bincntBaseAddr, *bincntZeroAddr;
-    Atoms *extra = NULL;
-
-    int i, j, k, n;
-    int sum, total;
-
-    float avgFillFull, avgFillCover;
-    const float cutoff2 = cutoff * cutoff;
-    const float inv_cutoff2 = 1.f / cutoff2;
-
-    long gridDim[3], blockDim[3];
+    struct pb_TimerSet *timers,     /* for measuring execution time */
+    Lattice *lattice, float cutoff, /* cutoff distance */
+    Atoms *atoms,                   /* array of atoms */
+    int verbose                     /* print info/debug messages */
+) {
+  int nx = lattice->dim.nx;
+  int ny = lattice->dim.ny;
+  int nz = lattice->dim.nz;
+  float xlo = lattice->dim.lo.x;
+  float ylo = lattice->dim.lo.y;
+  float zlo = lattice->dim.lo.z;
+  float h = lattice->dim.h;
+  int natoms = atoms->size;
+  Atom *atom = atoms->atoms;
+
+  xyz nbrlist[NBRLIST_MAXLEN];
+  size_t bytes_nbrlist = sizeof(xyz) * NBRLIST_MAXLEN;
+  int *nbrlistlen = (int *)malloc(sizeof(int));
+
+  int binHistoFull[BIN_DEPTH + 1] = {0};  /* clear every array element */
+  int binHistoCover[BIN_DEPTH + 1] = {0}; /* clear every array element */
+  int num_excluded = 0;
+
+  int xRegionDim, yRegionDim, zRegionDim;
+  int xRegionIndex, yRegionIndex, zRegionIndex;
+  int xOffset, yOffset, zOffset;
+  int lnx, lny, lnz, lnall;
+  ener_t *regionZeroAddr, *thisRegion;
+  int index, indexRegion;
+
+  int c;
+  xyz binDim;
+  int nbins;
+  float4 *binBaseAddr, *binZeroAddr;
+  int *bincntBaseAddr, *bincntZeroAddr;
+  Atoms *extra = NULL;
+
+  int i, j, k, n;
+  int sum, total;
+
+  float avgFillFull, avgFillCover;
+  const float cutoff2 = cutoff * cutoff;
+  const float inv_cutoff2 = 1.f / cutoff2;
+
+  long gridDim[3], blockDim[3];
 
 #ifdef NEIGHBOR_COUNT
-    double neighbor_count = 0;	/* used to profile the number of atoms near a
-				 * lattice point */
+  double neighbor_count = 0; /* used to profile the number of atoms near a
+                              * lattice point */
 #endif
 
-    // Caller has made the "compute" timer active
-
-    /* pad lattice to be factor of 8 in each dimension */
-    xRegionDim = (int) ceilf(nx/8.f);
-    yRegionDim = (int) ceilf(ny/8.f);
-    zRegionDim = (int) ceilf(nz/8.f);
-
-    lnx = 8 * xRegionDim;
-    lny = 8 * yRegionDim;
-    lnz = 8 * zRegionDim;
-    lnall = lnx * lny * lnz;
-
-    /* will receive energies from OpenCL */
-    size_t bytes_regionZeroAddr = lnall * sizeof(ener_t);
-    regionZeroAddr = (ener_t *) malloc(bytes_regionZeroAddr);
-
-    /* create bins */
-    c = (int) ceil(cutoff * BIN_INVLEN);  /* count extra bins around lattice */
-    binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
-    binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
-    binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
-    nbins = binDim.x * binDim.y * binDim.z;
-    binBaseAddr = (float4 *) calloc(nbins * BIN_DEPTH, sizeof(float4));
-    size_t bytes_binBaseAddr = nbins * BIN_DEPTH * sizeof(float4);
-
-    binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
-
-    bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
-    bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
-
-    /* create neighbor list */
-    if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
-        float s = sqrtf(3);
-        float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
-        int cnt = 0;
-        /* develop neighbor list around 1 cell */
-        if (2*c + 1 > NBRLIST_DIM) {
-            fprintf(stderr, "must have cutoff <= %f\n",
-                    (NBRLIST_DIM-1)/2 * BIN_LENGTH);
-            return -1;
-        }
-        for (k = -c;  k <= c;  k++) {
-            for (j = -c;  j <= c;  j++) {
-                for (i = -c;  i <= c;  i++) {
-                    if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
-                    nbrlist[cnt].x = i;
-                    nbrlist[cnt].y = j;
-                    nbrlist[cnt].z = k;
-                    cnt++;
-                }
-            }
-        }
-        *nbrlistlen = cnt;
+  // Caller has made the "compute" timer active
+
+  /* pad lattice to be factor of 8 in each dimension */
+  xRegionDim = (int)ceilf(nx / 8.f);
+  yRegionDim = (int)ceilf(ny / 8.f);
+  zRegionDim = (int)ceilf(nz / 8.f);
+
+  lnx = 8 * xRegionDim;
+  lny = 8 * yRegionDim;
+  lnz = 8 * zRegionDim;
+  lnall = lnx * lny * lnz;
+
+  /* will receive energies from OpenCL */
+  size_t bytes_regionZeroAddr = lnall * sizeof(ener_t);
+  regionZeroAddr = (ener_t *)malloc(bytes_regionZeroAddr);
+
+  /* create bins */
+  c = (int)ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */
+  binDim.x = (int)ceil(lnx * h * BIN_INVLEN) + 2 * c;
+  binDim.y = (int)ceil(lny * h * BIN_INVLEN) + 2 * c;
+  binDim.z = (int)ceil(lnz * h * BIN_INVLEN) + 2 * c;
+  nbins = binDim.x * binDim.y * binDim.z;
+  binBaseAddr = (float4 *)calloc(nbins * BIN_DEPTH, sizeof(float4));
+  size_t bytes_binBaseAddr = nbins * BIN_DEPTH * sizeof(float4);
+
+  binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
+
+  bincntBaseAddr = (int *)calloc(nbins, sizeof(int));
+  bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
+
+  /* create neighbor list */
+  if (ceilf(BIN_LENGTH / (8 * h)) == floorf(BIN_LENGTH / (8 * h))) {
+    float s = sqrtf(3);
+    float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH);
+    int cnt = 0;
+    /* develop neighbor list around 1 cell */
+    if (2 * c + 1 > NBRLIST_DIM) {
+      fprintf(stderr, "must have cutoff <= %f\n",
+              (NBRLIST_DIM - 1) / 2 * BIN_LENGTH);
+      return -1;
     }
-    else if (8*h <= 2*BIN_LENGTH) {
-        float s = 2.f*sqrtf(3);
-        float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
-        int cnt = 0;
-        /* develop neighbor list around 3-cube of cells */
-        if (2*c + 3 > NBRLIST_DIM) {
-            fprintf(stderr, "must have cutoff <= %f\n",
-                    (NBRLIST_DIM-3)/2 * BIN_LENGTH);
-            return -1;
-        }
-        for (k = -c;  k <= c;  k++) {
-            for (j = -c;  j <= c;  j++) {
-                for (i = -c;  i <= c;  i++) {
-                    if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
-                    nbrlist[cnt].x = i;
-                    nbrlist[cnt].y = j;
-                    nbrlist[cnt].z = k;
-                    cnt++;
-                }
-            }
+    for (k = -c; k <= c; k++) {
+      for (j = -c; j <= c; j++) {
+        for (i = -c; i <= c; i++) {
+          if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2)
+            continue;
+          nbrlist[cnt].x = i;
+          nbrlist[cnt].y = j;
+          nbrlist[cnt].z = k;
+          cnt++;
         }
-        *nbrlistlen = cnt;
+      }
     }
-    else {
-        fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
-        return -1;
+    *nbrlistlen = cnt;
+  } else if (8 * h <= 2 * BIN_LENGTH) {
+    float s = 2.f * sqrtf(3);
+    float r2 = (cutoff + s * BIN_LENGTH) * (cutoff + s * BIN_LENGTH);
+    int cnt = 0;
+    /* develop neighbor list around 3-cube of cells */
+    if (2 * c + 3 > NBRLIST_DIM) {
+      fprintf(stderr, "must have cutoff <= %f\n",
+              (NBRLIST_DIM - 3) / 2 * BIN_LENGTH);
+      return -1;
     }
-
-    /* perform geometric hashing of atoms into bins */
-    {
-        /* array of extra atoms, permit average of one extra per bin */
-        Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
-        int extra_len = 0;
-
-        for (n = 0;  n < natoms;  n++) {
-            float4 p;
-            p.x = atom[n].x - xlo;
-            p.y = atom[n].y - ylo;
-            p.z = atom[n].z - zlo;
-            p.w = atom[n].q;
-            i = (int) floorf(p.x * BIN_INVLEN);
-            j = (int) floorf(p.y * BIN_INVLEN);
-            k = (int) floorf(p.z * BIN_INVLEN);
-            if (i >= -c && i < binDim.x - c &&
-                    j >= -c && j < binDim.y - c &&
-                    k >= -c && k < binDim.z - c &&
-                    atom[n].q != 0) {
-                int index = (k * binDim.y + j) * binDim.x + i;
-                float4 *bin = binZeroAddr + index * BIN_DEPTH;
-                int bindex = bincntZeroAddr[index];
-                if (bindex < BIN_DEPTH) {
-                    /* copy atom into bin and increase counter for this bin */
-                    bin[bindex] = p;
-                    bincntZeroAddr[index]++;
-                }
-                else {
-                    /* add index to array of extra atoms to be computed with CPU */
-                    if (extra_len >= nbins) {
-                        fprintf(stderr, "exceeded space for storing extra atoms\n");
-                        return -1;
-                    }
-                    extra_atoms[extra_len] = atom[n];
-                    extra_len++;
-                }
-            }
-            else {
-                /* excluded atoms are either outside bins or neutrally charged */
-                num_excluded++;
-            }
+    for (k = -c; k <= c; k++) {
+      for (j = -c; j <= c; j++) {
+        for (i = -c; i <= c; i++) {
+          if ((i * i + j * j + k * k) * BIN_LENGTH * BIN_LENGTH >= r2)
+            continue;
+          nbrlist[cnt].x = i;
+          nbrlist[cnt].y = j;
+          nbrlist[cnt].z = k;
+          cnt++;
         }
-
-        /* Save result */
-        extra = (Atoms *)malloc(sizeof(Atoms));
-        extra->atoms = extra_atoms;
-        extra->size = extra_len;
+      }
+    }
+    *nbrlistlen = cnt;
+  } else {
+    fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
+    return -1;
+  }
+
+  /* perform geometric hashing of atoms into bins */
+  {
+    /* array of extra atoms, permit average of one extra per bin */
+    Atom *extra_atoms = (Atom *)calloc(nbins, sizeof(Atom));
+    int extra_len = 0;
+
+    for (n = 0; n < natoms; n++) {
+      float4 p;
+      p.x = atom[n].x - xlo;
+      p.y = atom[n].y - ylo;
+      p.z = atom[n].z - zlo;
+      p.w = atom[n].q;
+      i = (int)floorf(p.x * BIN_INVLEN);
+      j = (int)floorf(p.y * BIN_INVLEN);
+      k = (int)floorf(p.z * BIN_INVLEN);
+      if (i >= -c && i < binDim.x - c && j >= -c && j < binDim.y - c &&
+          k >= -c && k < binDim.z - c && atom[n].q != 0) {
+        int index = (k * binDim.y + j) * binDim.x + i;
+        float4 *bin = binZeroAddr + index * BIN_DEPTH;
+        int bindex = bincntZeroAddr[index];
+        if (bindex < BIN_DEPTH) {
+          /* copy atom into bin and increase counter for this bin */
+          bin[bindex] = p;
+          bincntZeroAddr[index]++;
+        } else {
+          /* add index to array of extra atoms to be computed with CPU */
+          if (extra_len >= nbins) {
+            fprintf(stderr, "exceeded space for storing extra atoms\n");
+            return -1;
+          }
+          extra_atoms[extra_len] = atom[n];
+          extra_len++;
+        }
+      } else {
+        /* excluded atoms are either outside bins or neutrally charged */
+        num_excluded++;
+      }
     }
 
-    /* bin stats */
-    sum = total = 0;
-    for (n = 0;  n < nbins;  n++) {
-        binHistoFull[ bincntBaseAddr[n] ]++;
-        sum += bincntBaseAddr[n];
+    /* Save result */
+    extra = (Atoms *)malloc(sizeof(Atoms));
+    extra->atoms = extra_atoms;
+    extra->size = extra_len;
+  }
+
+  /* bin stats */
+  sum = total = 0;
+  for (n = 0; n < nbins; n++) {
+    binHistoFull[bincntBaseAddr[n]]++;
+    sum += bincntBaseAddr[n];
+    total += BIN_DEPTH;
+  }
+  avgFillFull = sum / (float)total;
+  sum = total = 0;
+  for (k = 0; k < binDim.z - 2 * c; k++) {
+    for (j = 0; j < binDim.y - 2 * c; j++) {
+      for (i = 0; i < binDim.x - 2 * c; i++) {
+        int index = (k * binDim.y + j) * binDim.x + i;
+        binHistoCover[bincntZeroAddr[index]]++;
+        sum += bincntZeroAddr[index];
         total += BIN_DEPTH;
+      }
     }
-    avgFillFull = sum / (float) total;
-    sum = total = 0;
-    for (k = 0;  k < binDim.z - 2*c;  k++) {
-        for (j = 0;  j < binDim.y - 2*c;  j++) {
-            for (i = 0;  i < binDim.x - 2*c;  i++) {
-                int index = (k * binDim.y + j) * binDim.x + i;
-                binHistoCover[ bincntZeroAddr[index] ]++;
-                sum += bincntZeroAddr[index];
-                total += BIN_DEPTH;
-            }
-        }
+  }
+  avgFillCover = sum / (float)total;
+
+  if (verbose) {
+    /* report */
+    printf("number of atoms = %d\n", natoms);
+    printf("lattice spacing = %g\n", h);
+    printf("cutoff distance = %g\n", cutoff);
+    printf("\n");
+    printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
+    printf("requested space dimensions = %g %g %g\n", nx * h, ny * h, nz * h);
+    printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
+    printf("expanded space dimensions = %g %g %g\n", lnx * h, lny * h, lnz * h);
+    printf("number of bytes for lattice data = %lu\n", lnall * sizeof(float));
+    printf("\n");
+    printf("bin padding thickness = %d\n", c);
+    printf("bin cover dimensions = %d %d %d\n", binDim.x - 2 * c,
+           binDim.y - 2 * c, binDim.z - 2 * c);
+    printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
+    printf("number of bins = %d\n", nbins);
+    printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
+    printf("%% overhead space = %g\n",
+           (natoms / (double)(nbins * BIN_DEPTH)) * 100);
+    printf("number of bytes for bin data = %lu\n",
+           nbins * BIN_DEPTH * sizeof(float4));
+    printf("\n");
+    printf("bin histogram with padding:\n");
+    sum = 0;
+    for (n = 0; n <= BIN_DEPTH; n++) {
+      printf("     number of bins with %d atoms:  %d\n", n, binHistoFull[n]);
+      sum += binHistoFull[n];
     }
-    avgFillCover = sum / (float) total;
-
-    if (verbose) {
-        /* report */
-        printf("number of atoms = %d\n", natoms);
-        printf("lattice spacing = %g\n", h);
-        printf("cutoff distance = %g\n", cutoff);
-        printf("\n");
-        printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
-        printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
-        printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
-        printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
-        printf("number of bytes for lattice data = %lu\n", lnall*sizeof(float));
-        printf("\n");
-        printf("bin padding thickness = %d\n", c);
-        printf("bin cover dimensions = %d %d %d\n",
-               binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
-        printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
-        printf("number of bins = %d\n", nbins);
-        printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
-        printf("%% overhead space = %g\n",
-               (natoms / (double) (nbins * BIN_DEPTH)) * 100);
-        printf("number of bytes for bin data = %lu\n",
-               nbins * BIN_DEPTH * sizeof(float4));
-        printf("\n");
-        printf("bin histogram with padding:\n");
-        sum = 0;
-        for (n = 0;  n <= BIN_DEPTH;  n++) {
-            printf("     number of bins with %d atoms:  %d\n", n, binHistoFull[n]);
-            sum += binHistoFull[n];
-        }
-        printf("     total number of bins:  %d\n", sum);
-        printf("     %% average fill:  %g\n", avgFillFull * 100);
-        printf("\n");
-        printf("bin histogram excluding padding:\n");
-        sum = 0;
-        for (n = 0;  n <= BIN_DEPTH;  n++) {
-            printf("     number of bins with %d atoms:  %d\n", n, binHistoCover[n]);
-            sum += binHistoCover[n];
-        }
-        printf("     total number of bins:  %d\n", sum);
-        printf("     %% average fill:  %g\n", avgFillCover * 100);
-        printf("\n");
-        printf("number of extra atoms = %d\n", extra->size);
-        printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
-        printf("\n");
-
-        /* sanity check on bins */
-        sum = 0;
-        for (n = 0;  n <= BIN_DEPTH;  n++) {
-            sum += n * binHistoFull[n];
-        }
-        sum += extra->size + num_excluded;
-        printf("sanity check on bin histogram with edges:  "
-               "sum + others = %d\n", sum);
-        sum = 0;
-        for (n = 0;  n <= BIN_DEPTH;  n++) {
-            sum += n * binHistoCover[n];
-        }
-        sum += extra->size + num_excluded;
-        printf("sanity check on bin histogram excluding edges:  "
-               "sum + others = %d\n", sum);
-        printf("\n");
-
-        /* neighbor list */
-        printf("neighbor list length = %d\n", *nbrlistlen);
-        printf("\n");
+    printf("     total number of bins:  %d\n", sum);
+    printf("     %% average fill:  %g\n", avgFillFull * 100);
+    printf("\n");
+    printf("bin histogram excluding padding:\n");
+    sum = 0;
+    for (n = 0; n <= BIN_DEPTH; n++) {
+      printf("     number of bins with %d atoms:  %d\n", n, binHistoCover[n]);
+      sum += binHistoCover[n];
     }
-
-    // Track visc data
-    llvm_visc_track_mem(regionZeroAddr, bytes_regionZeroAddr);
-    llvm_visc_track_mem(binBaseAddr, bytes_binBaseAddr);
-    llvm_visc_track_mem(nbrlistlen, sizeof(int));
-    llvm_visc_track_mem(nbrlist, bytes_nbrlist);
-
-    /* setup OpenCL kernel parameters */
-    blockDim[0] = 8;
-    blockDim[1] = 2;
-    blockDim[2] = 8;
-    gridDim[0] = xRegionDim;
-    gridDim[1] = yRegionDim;
-    gridDim[2] = 1;
-
-    /* allocate and initialize memory on OpenCL device */
-    if (verbose) {
-        printf("Allocating %.2fMB on OpenCL device for potentials\n",
-               lnall * sizeof(float) / (double) (1024*1024));
+    printf("     total number of bins:  %d\n", sum);
+    printf("     %% average fill:  %g\n", avgFillCover * 100);
+    printf("\n");
+    printf("number of extra atoms = %d\n", extra->size);
+    printf("%% atoms that are extra = %g\n",
+           (extra->size / (double)natoms) * 100);
+    printf("\n");
+
+    /* sanity check on bins */
+    sum = 0;
+    for (n = 0; n <= BIN_DEPTH; n++) {
+      sum += n * binHistoFull[n];
     }
-
-    memset(regionZeroAddr,0,lnall*sizeof(ener_t));
-
-    if (verbose) {
-        printf("Allocating %.2fMB on OpenCL device for atom bins\n",
-               nbins * BIN_DEPTH * sizeof(float4) / (double) (1024*1024));
+    sum += extra->size + num_excluded;
+    printf("sanity check on bin histogram with edges:  "
+           "sum + others = %d\n",
+           sum);
+    sum = 0;
+    for (n = 0; n <= BIN_DEPTH; n++) {
+      sum += n * binHistoCover[n];
     }
-
-    //Sub buffers are not supported in OpenCL v1.0
-    int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
-
-    if (verbose)
-        printf("\n");
-
-
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData( args,
-              binDim.x,
-              binDim.y,
-              binBaseAddr,
-              bytes_binBaseAddr,
-              offset,
-              h,
-              cutoff2,
-              inv_cutoff2,
-              regionZeroAddr,
-              bytes_regionZeroAddr,
-              zRegionIndex,
-              nbrlistlen,
-              (size_t )sizeof(int),
-              nbrlist,
-              bytes_nbrlist,
-              blockDim[0],
-              blockDim[1],
-              blockDim[2],
-              gridDim[0],
-              gridDim[1],
-              gridDim[2]
-            );
-
-    /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
-    pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION);
-    void* CUTCP_DFG;
-    if(verbose)
-      printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
-    for (zRegionIndex = 0;  zRegionIndex < zRegionDim;  zRegionIndex++) {
+    sum += extra->size + num_excluded;
+    printf("sanity check on bin histogram excluding edges:  "
+           "sum + others = %d\n",
+           sum);
+    printf("\n");
+
+    /* neighbor list */
+    printf("neighbor list length = %d\n", *nbrlistlen);
+    printf("\n");
+  }
+
+  // Track visc data
+  llvm_visc_track_mem(regionZeroAddr, bytes_regionZeroAddr);
+  llvm_visc_track_mem(binBaseAddr, bytes_binBaseAddr);
+  llvm_visc_track_mem(nbrlistlen, sizeof(int));
+  llvm_visc_track_mem(nbrlist, bytes_nbrlist);
+
+  /* setup OpenCL kernel parameters */
+  blockDim[0] = 8;
+  blockDim[1] = 2;
+  blockDim[2] = 8;
+  gridDim[0] = xRegionDim;
+  gridDim[1] = yRegionDim;
+  gridDim[2] = 1;
+
+  /* allocate and initialize memory on OpenCL device */
+  if (verbose) {
+    printf("Allocating %.2fMB on OpenCL device for potentials\n",
+           lnall * sizeof(float) / (double)(1024 * 1024));
+  }
+
+  memset(regionZeroAddr, 0, lnall * sizeof(ener_t));
+
+  if (verbose) {
+    printf("Allocating %.2fMB on OpenCL device for atom bins\n",
+           nbins * BIN_DEPTH * sizeof(float4) / (double)(1024 * 1024));
+  }
+
+  // Sub buffers are not supported in OpenCL v1.0
+  int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
+
+  if (verbose)
+    printf("\n");
+
+  RootIn *args = (RootIn *)malloc(sizeof(RootIn));
+  packData(args, binDim.x, binDim.y, binBaseAddr, bytes_binBaseAddr, offset, h,
+           cutoff2, inv_cutoff2, regionZeroAddr, bytes_regionZeroAddr,
+           zRegionIndex, nbrlistlen, (size_t)sizeof(int), nbrlist,
+           bytes_nbrlist, blockDim[0], blockDim[1], blockDim[2], gridDim[0],
+           gridDim[1], gridDim[2]);
+
+  /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
+  pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION);
+  void *CUTCP_DFG;
+  if (verbose)
+    printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
+  for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
 #ifndef NO_DEBUG
-        printf("  computing plane %d\n", zRegionIndex);
-        fflush(stdout);
+    printf("  computing plane %d\n", zRegionIndex);
+    fflush(stdout);
 #endif
 
-        args->zRegionIndex = zRegionIndex;
-
-        CUTCP_DFG = __visc__launch(0, CUTCPWrapper, (void*)args);
-        __visc__wait(CUTCP_DFG);
-        //llvm_visc_request_mem(regionZeroAddr, lnall*sizeof(ener_t));
-    }
-
-    /*
-     * handle extra atoms on the CPU, concurrently with the GPU calculations
-     */
+    args->zRegionIndex = zRegionIndex;
 
-    pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-    if (extra->size > 0) {
-        if(verbose) {
-          printf("computing extra atoms on CPU\n");
-        }
+    CUTCP_DFG = __visc__launch(0, CUTCPWrapper, (void *)args);
+    __visc__wait(CUTCP_DFG);
+    // llvm_visc_request_mem(regionZeroAddr, lnall*sizeof(ener_t));
+  }
 
-        pb_SwitchToTimer(timers, visc_TimerID_MISC);
+  /*
+   * handle extra atoms on the CPU, concurrently with the GPU calculations
+   */
 
-        if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
-            fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
-                    "for extra atoms\n");
-            return -1;
-        }
-        pb_SwitchToTimer(timers, visc_TimerID_MISC);
-        printf("\n");
+  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
+  if (extra->size > 0) {
+    if (verbose) {
+      printf("computing extra atoms on CPU\n");
     }
-    if(verbose)
-      printf("Finished OpenCL kernel calls                        \n");
-
-    /* copy result regions from OpenCL device */
-    pb_SwitchToTimer(timers, pb_TimerID_COPY);
-
-    llvm_visc_request_mem(regionZeroAddr, lnall*sizeof(ener_t));
 
-    /*
-     * transpose on CPU, updating, producing the final lattice
-     */
-    /* transpose regions back into lattice */
-    pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-    for (k = 0;  k < nz;  k++) {
-        zRegionIndex = (k >> 3);
-        zOffset = (k & 7);
+    pb_SwitchToTimer(timers, visc_TimerID_MISC);
 
-        for (j = 0;  j < ny;  j++) {
-            yRegionIndex = (j >> 3);
-            yOffset = (j & 7);
-
-            for (i = 0;  i < nx;  i++) {
-                xRegionIndex = (i >> 3);
-                xOffset = (i & 7);
-
-                thisRegion = regionZeroAddr
-                             + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
-                                + xRegionIndex) * REGION_SIZE;
-
-                indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
-                index = (k * ny + j) * nx + i;
+    if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
+      fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
+                      "for extra atoms\n");
+      return -1;
+    }
+    pb_SwitchToTimer(timers, visc_TimerID_MISC);
+    printf("\n");
+  }
+  if (verbose)
+    printf("Finished OpenCL kernel calls                        \n");
+
+  /* copy result regions from OpenCL device */
+  pb_SwitchToTimer(timers, pb_TimerID_COPY);
+
+  llvm_visc_request_mem(regionZeroAddr, lnall * sizeof(ener_t));
+
+  /*
+   * transpose on CPU, updating, producing the final lattice
+   */
+  /* transpose regions back into lattice */
+  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
+  for (k = 0; k < nz; k++) {
+    zRegionIndex = (k >> 3);
+    zOffset = (k & 7);
+
+    for (j = 0; j < ny; j++) {
+      yRegionIndex = (j >> 3);
+      yOffset = (j & 7);
+
+      for (i = 0; i < nx; i++) {
+        xRegionIndex = (i >> 3);
+        xOffset = (i & 7);
+
+        thisRegion = regionZeroAddr +
+                     ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim +
+                      xRegionIndex) *
+                         REGION_SIZE;
+
+        indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
+        index = (k * ny + j) * nx + i;
 
 #ifndef NEIGHBOR_COUNT
-                lattice->lattice[index] += thisRegion[indexRegion];
+        lattice->lattice[index] += thisRegion[indexRegion];
 #else
-                neighbor_count += thisRegion[indexRegion];
+        neighbor_count += thisRegion[indexRegion];
 #endif
-            }
-        }
+      }
     }
+  }
 
 #ifdef NEIGHBOR_COUNT
-    printf("Neighbor count: %f\n", (float)neighbor_count);
+  printf("Neighbor count: %f\n", (float)neighbor_count);
 #endif
 
-    /* cleanup memory allocations */
-    free(regionZeroAddr);
-    free(binBaseAddr);
-    free(bincntBaseAddr);
-    free_atom(extra);
+  /* cleanup memory allocations */
+  free(regionZeroAddr);
+  free(binBaseAddr);
+  free(bincntBaseAddr);
+  free_atom(extra);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h
index b88103818f6499a3cdddd40ff3d5ac345d2762f1..a88ee486f16f0452ec9894a3b2b28d9e961d417e 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/ocl.h
@@ -2,14 +2,13 @@
 #define __OCLH__
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c
index ac45761fb86afd598dfe24f2ecead5622cf00954..145f59cc065131db3461a04f9674a94afbf0cfb5 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.c
@@ -6,18 +6,16 @@
  *cr
  ***************************************************************************/
 
-#include <stdio.h>
-#include <stdlib.h>
 #include <inttypes.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 #include "atom.h"
 #include "cutoff.h"
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice)
-{
+void write_lattice_summary(const char *filename, Lattice *lattice) {
   float *lattice_data = lattice->lattice;
   int nx = lattice->dim.nx;
   int ny = lattice->dim.ny;
@@ -38,21 +36,21 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int i;
 
     for (i = 0; i < nx * ny * nz; i++)
-      abspotential += fabs((double) lattice_data[i]);
+      abspotential += fabs((double)lattice_data[i]);
 
-    tmp = (float) abspotential;
+    tmp = (float)abspotential;
 
     fwrite(&tmp, 1, sizeof(float), outfile);
-    //fprintf(outfile,"%f\n",tmp);
+    // fprintf(outfile,"%f\n",tmp);
   }
 
   /* Write the size of a lattice plane */
   {
     uint32_t tmp;
 
-    tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
+    tmp = (uint32_t)(lattice->dim.nx * lattice->dim.ny);
     fwrite(&tmp, 1, sizeof(uint32_t), outfile);
-    //fprintf(outfile,"%u\n",tmp);
+    // fprintf(outfile,"%u\n",tmp);
   }
 
   /* Write the plane of lattice data at z=0 and z = nz-1 */
@@ -60,11 +58,11 @@ write_lattice_summary(const char *filename, Lattice *lattice)
     int plane_size = nx * ny;
 
     fwrite(lattice_data, plane_size, sizeof(float), outfile);
-    fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
-	   outfile);
-//int i;
-   //for(i=0;i<100;i++)
-	//fprintf(outfile,"%f ",lattice_data[i]);
+    fwrite(lattice_data + (nz - 1) * plane_size, plane_size, sizeof(float),
+           outfile);
+    // int i;
+    // for(i=0;i<100;i++)
+    // fprintf(outfile,"%f ",lattice_data[i]);
   }
 
   /* Cleanup */
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h b/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h
index 2ddd39227e6c043207897e923f9c7076452eff52..78a5f846e2feda2d1142ae0e1ea4f5edb4eb5ad6 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/output.h
@@ -15,8 +15,7 @@
 extern "C" {
 #endif
 
-void
-write_lattice_summary(const char *filename, Lattice *lattice);
+void write_lattice_summary(const char *filename, Lattice *lattice);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c b/hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c
index b9ede0e39b229a195da42e1197a2588ac8a7f190..7a04360a70c40ac50cd72fb218aed5f216247e91 100644
--- a/hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c
+++ b/hpvm/test/parboil/benchmarks/cutcp/src/visc/readatom.c
@@ -6,36 +6,33 @@
  *cr
  ***************************************************************************/
 
+#include "atom.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
-#include "atom.h"
-
 
 #define LINELEN 96
 #define INITLEN 20
 
-
-Atoms *read_atom_file(const char *fname)
-{
+Atoms *read_atom_file(const char *fname) {
   FILE *file;
   char line[LINELEN];
 
-  Atom *atom;			/* Atom array */
-  int len = INITLEN;		/* Size of atom array */
-  int cnt = 0;			/* Number of atoms read */
+  Atom *atom;        /* Atom array */
+  int len = INITLEN; /* Size of atom array */
+  int cnt = 0;       /* Number of atoms read */
 
   /* open atom "pqr" file */
   file = fopen(fname, "r");
-  if (NULL==file) {
+  if (NULL == file) {
     fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
     return NULL;
   }
 
   /* allocate initial atom array */
-  atom = (Atom *) malloc(len * sizeof(Atom));
-  if (NULL==atom) {
+  atom = (Atom *)malloc(len * sizeof(Atom));
+  if (NULL == atom) {
     fprintf(stderr, "can't allocate memory\n");
     return NULL;
   }
@@ -44,31 +41,32 @@ Atoms *read_atom_file(const char *fname)
   while (fgets(line, LINELEN, file) != NULL) {
 
     if (strncmp(line, "ATOM  ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
-      continue;  /* skip anything that isn't an atom record */
+      continue; /* skip anything that isn't an atom record */
     }
 
-    if (cnt==len) {  /* extend atom array */
-      void *tmp = realloc(atom, 2*len*sizeof(Atom));
-      if (NULL==tmp) {
+    if (cnt == len) { /* extend atom array */
+      void *tmp = realloc(atom, 2 * len * sizeof(Atom));
+      if (NULL == tmp) {
         fprintf(stderr, "can't allocate more memory\n");
         return NULL;
       }
-      atom = (Atom *) tmp;
+      atom = (Atom *)tmp;
       len *= 2;
     }
 
     /* read position coordinates and charge from atom record */
     if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
-          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
-      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
+               &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
+      fprintf(stderr, "atom record %d does not have expected format\n",
+              cnt + 1);
       return NULL;
     }
 
-    cnt++;  /* count atoms as we store them */
+    cnt++; /* count atoms as we store them */
   }
 
   /* verify EOF and close file */
-  if ( !feof(file) ) {
+  if (!feof(file)) {
     fprintf(stderr, "did not find EOF\n");
     return NULL;
   }
@@ -93,18 +91,14 @@ Atoms *read_atom_file(const char *fname)
   }
 }
 
-
-void free_atom(Atoms *atom)
-{
+void free_atom(Atoms *atom) {
   if (atom) {
     free(atom->atoms);
     free(atom);
   }
 }
 
-void
-get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
-{
+void get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom) {
   Atom *atoms = atom->atoms;
   int natoms = atom->size;
   Vec3 lo;
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/config.h b/hpvm/test/parboil/benchmarks/lbm/src/cpu/config.h
index ce9ce82c4acc351d7d239f3053023e964490eabe..0cd4bd055875c814b1712939b73179f7607043ad 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/config.h
@@ -14,7 +14,7 @@
 
 #define OMEGA (1.95f)
 
-#define OUTPUT_PRECISION float 
+#define OUTPUT_PRECISION float
 
 #define BOOL int
 #define TRUE (-1)
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.c
index 81294ac4455b4a92dfe80b7cb5d0ac0696a4b027..e6ea7c4d621e8470680a125bca11f70a634f2a56 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.c
@@ -4,9 +4,8 @@
 
 #include "lbm.h"
 #include <math.h>
-#include <stdlib.h>
 #include <stdio.h>
-
+#include <stdlib.h>
 
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
@@ -16,674 +15,757 @@
 
 /*############################################################################*/
 
-#define DFL1 (1.0/ 3.0)
-#define DFL2 (1.0/18.0)
-#define DFL3 (1.0/36.0)
+#define DFL1 (1.0 / 3.0)
+#define DFL2 (1.0 / 18.0)
+#define DFL3 (1.0 / 36.0)
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr ) {
-	const size_t margin = 2*SIZE_X*SIZE_Y*N_CELL_ENTRIES,
-	             size   = sizeof( LBM_Grid ) + 2*margin*sizeof( float );
+void LBM_allocateGrid(float **ptr) {
+  const size_t margin = 2 * SIZE_X * SIZE_Y * N_CELL_ENTRIES,
+               size = sizeof(LBM_Grid) + 2 * margin * sizeof(float);
 
-	*ptr = malloc( size );
-	if( ! *ptr ) {
-		printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
-		        size / (1024.0*1024.0) );
-		exit( 1 );
-	}
+  *ptr = malloc(size);
+  if (!*ptr) {
+    printf("LBM_allocateGrid: could not allocate %.1f MByte\n",
+           size / (1024.0 * 1024.0));
+    exit(1);
+  }
 #if !defined(SPEC_CPU)
-	printf( "LBM_allocateGrid: allocated %.1f MByte\n",
-	        size / (1024.0*1024.0) );
+  printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0));
 #endif
-	*ptr += margin;
+  *ptr += margin;
 }
 
 /*############################################################################*/
 
-void LBM_freeGrid( float** ptr ) {
-	const size_t margin = 2*SIZE_X*SIZE_Y*N_CELL_ENTRIES;
+void LBM_freeGrid(float **ptr) {
+  const size_t margin = 2 * SIZE_X * SIZE_Y * N_CELL_ENTRIES;
 
-	free( *ptr-margin );
-	*ptr = NULL;
+  free(*ptr - margin);
+  *ptr = NULL;
 }
 
 /*############################################################################*/
 
-void LBM_initializeGrid( LBM_Grid grid ) {
-	SWEEP_VAR
+void LBM_initializeGrid(LBM_Grid grid) {
+  SWEEP_VAR
 
-	/*voption indep*/
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 #endif
-	SWEEP_START( 0, 0, -2, 0, 0, SIZE_Z+2 )
-		LOCAL( grid, C  ) = DFL1;
-		LOCAL( grid, N  ) = DFL2;
-		LOCAL( grid, S  ) = DFL2;
-		LOCAL( grid, E  ) = DFL2;
-		LOCAL( grid, W  ) = DFL2;
-		LOCAL( grid, T  ) = DFL2;
-		LOCAL( grid, B  ) = DFL2;
-		LOCAL( grid, NE ) = DFL3;
-		LOCAL( grid, NW ) = DFL3;
-		LOCAL( grid, SE ) = DFL3;
-		LOCAL( grid, SW ) = DFL3;
-		LOCAL( grid, NT ) = DFL3;
-		LOCAL( grid, NB ) = DFL3;
-		LOCAL( grid, ST ) = DFL3;
-		LOCAL( grid, SB ) = DFL3;
-		LOCAL( grid, ET ) = DFL3;
-		LOCAL( grid, EB ) = DFL3;
-		LOCAL( grid, WT ) = DFL3;
-		LOCAL( grid, WB ) = DFL3;
-
-		CLEAR_ALL_FLAGS_SWEEP( grid );
-	SWEEP_END
+  SWEEP_START(0, 0, -2, 0, 0, SIZE_Z + 2)
+  LOCAL(grid, C) = DFL1;
+  LOCAL(grid, N) = DFL2;
+  LOCAL(grid, S) = DFL2;
+  LOCAL(grid, E) = DFL2;
+  LOCAL(grid, W) = DFL2;
+  LOCAL(grid, T) = DFL2;
+  LOCAL(grid, B) = DFL2;
+  LOCAL(grid, NE) = DFL3;
+  LOCAL(grid, NW) = DFL3;
+  LOCAL(grid, SE) = DFL3;
+  LOCAL(grid, SW) = DFL3;
+  LOCAL(grid, NT) = DFL3;
+  LOCAL(grid, NB) = DFL3;
+  LOCAL(grid, ST) = DFL3;
+  LOCAL(grid, SB) = DFL3;
+  LOCAL(grid, ET) = DFL3;
+  LOCAL(grid, EB) = DFL3;
+  LOCAL(grid, WT) = DFL3;
+  LOCAL(grid, WB) = DFL3;
+
+  CLEAR_ALL_FLAGS_SWEEP(grid);
+  SWEEP_END
 }
 
 /*############################################################################*/
 
-void LBM_swapGrids( LBM_GridPtr* grid1, LBM_GridPtr* grid2 ) {
-	LBM_GridPtr aux = *grid1;
-	*grid1 = *grid2;
-	*grid2 = aux;
+void LBM_swapGrids(LBM_GridPtr *grid1, LBM_GridPtr *grid2) {
+  LBM_GridPtr aux = *grid1;
+  *grid1 = *grid2;
+  *grid2 = aux;
 }
 
 /*############################################################################*/
 
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
-	int x,  y,  z;
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) {
+  int x, y, z;
 
-	FILE* file = fopen( filename, "rb" );
+  FILE *file = fopen(filename, "rb");
 
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE );
-			}
-			fgetc( file );
-		}
-		fgetc( file );
-	}
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (fgetc(file) != '.')
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+      }
+      fgetc(file);
+    }
+    fgetc(file);
+  }
 
-	fclose( file );
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
-	int x,  y,  z;
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) {
+  int x, y, z;
 
-	/*voption indep*/
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
-#pragma omp parallel for private( x, y )
+#pragma omp parallel for private(x, y)
 #endif
 #endif
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-				    y == 0 || y == SIZE_Y-1 ||
-				    z == 0 || z == SIZE_Z-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-				}
-				else {
-					if( (z == 1 || z == SIZE_Z-2) &&
-					     x > 1 && x < SIZE_X-2 &&
-					     y > 1 && y < SIZE_Y-2 ) {
-						SET_FLAG( grid, x, y, z, ACCEL );
-					}
-				}
-			}
-		}
-	}
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 ||
+            z == SIZE_Z - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+        } else {
+          if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 &&
+              y < SIZE_Y - 2) {
+            SET_FLAG(grid, x, y, z, ACCEL);
+          }
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForChannel( LBM_Grid grid ) {
-	int x,  y,  z;
+void LBM_initializeSpecialCellsForChannel(LBM_Grid grid) {
+  int x, y, z;
 
-	/*voption indep*/
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
-#pragma omp parallel for private( x, y )
+#pragma omp parallel for private(x, y)
 #endif
 #endif
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-				    y == 0 || y == SIZE_Y-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-
-					if( (z == 0 || z == SIZE_Z-1) &&
-					    ! TEST_FLAG( grid, x, y, z, OBSTACLE ))
-						SET_FLAG( grid, x, y, z, IN_OUT_FLOW );
-				}
-			}
-		}
-	}
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+
+          if ((z == 0 || z == SIZE_Z - 1) &&
+              !TEST_FLAG(grid, x, y, z, OBSTACLE))
+            SET_FLAG(grid, x, y, z, IN_OUT_FLOW);
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid ) {
-	SWEEP_VAR
+void LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid) {
+  SWEEP_VAR
 
-	float ux, uy, uz, u2, rho;
+  float ux, uy, uz, u2, rho;
 
-	/*voption indep*/
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
-#pragma omp parallel for private( ux, uy, uz, u2, rho )
+#pragma omp parallel for private(ux, uy, uz, u2, rho)
 #endif
 #endif
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		if( TEST_FLAG_SWEEP( srcGrid, OBSTACLE )) {
-			DST_C ( dstGrid ) = SRC_C ( srcGrid );
-			DST_S ( dstGrid ) = SRC_N ( srcGrid );
-			DST_N ( dstGrid ) = SRC_S ( srcGrid );
-			DST_W ( dstGrid ) = SRC_E ( srcGrid );
-			DST_E ( dstGrid ) = SRC_W ( srcGrid );
-			DST_B ( dstGrid ) = SRC_T ( srcGrid );
-			DST_T ( dstGrid ) = SRC_B ( srcGrid );
-			DST_SW( dstGrid ) = SRC_NE( srcGrid );
-			DST_SE( dstGrid ) = SRC_NW( srcGrid );
-			DST_NW( dstGrid ) = SRC_SE( srcGrid );
-			DST_NE( dstGrid ) = SRC_SW( srcGrid );
-			DST_SB( dstGrid ) = SRC_NT( srcGrid );
-			DST_ST( dstGrid ) = SRC_NB( srcGrid );
-			DST_NB( dstGrid ) = SRC_ST( srcGrid );
-			DST_NT( dstGrid ) = SRC_SB( srcGrid );
-			DST_WB( dstGrid ) = SRC_ET( srcGrid );
-			DST_WT( dstGrid ) = SRC_EB( srcGrid );
-			DST_EB( dstGrid ) = SRC_WT( srcGrid );
-			DST_ET( dstGrid ) = SRC_WB( srcGrid );
-			continue;
-		}
-
-		rho = + SRC_C ( srcGrid ) + SRC_N ( srcGrid )
-		      + SRC_S ( srcGrid ) + SRC_E ( srcGrid )
-		      + SRC_W ( srcGrid ) + SRC_T ( srcGrid )
-		      + SRC_B ( srcGrid ) + SRC_NE( srcGrid )
-		      + SRC_NW( srcGrid ) + SRC_SE( srcGrid )
-		      + SRC_SW( srcGrid ) + SRC_NT( srcGrid )
-		      + SRC_NB( srcGrid ) + SRC_ST( srcGrid )
-		      + SRC_SB( srcGrid ) + SRC_ET( srcGrid )
-		      + SRC_EB( srcGrid ) + SRC_WT( srcGrid )
-		      + SRC_WB( srcGrid );
-
-		ux = + SRC_E ( srcGrid ) - SRC_W ( srcGrid )
-		     + SRC_NE( srcGrid ) - SRC_NW( srcGrid )
-		     + SRC_SE( srcGrid ) - SRC_SW( srcGrid )
-		     + SRC_ET( srcGrid ) + SRC_EB( srcGrid )
-		     - SRC_WT( srcGrid ) - SRC_WB( srcGrid );
-		uy = + SRC_N ( srcGrid ) - SRC_S ( srcGrid )
-		     + SRC_NE( srcGrid ) + SRC_NW( srcGrid )
-		     - SRC_SE( srcGrid ) - SRC_SW( srcGrid )
-		     + SRC_NT( srcGrid ) + SRC_NB( srcGrid )
-		     - SRC_ST( srcGrid ) - SRC_SB( srcGrid );
-		uz = + SRC_T ( srcGrid ) - SRC_B ( srcGrid )
-		     + SRC_NT( srcGrid ) - SRC_NB( srcGrid )
-		     + SRC_ST( srcGrid ) - SRC_SB( srcGrid )
-		     + SRC_ET( srcGrid ) - SRC_EB( srcGrid )
-		     + SRC_WT( srcGrid ) - SRC_WB( srcGrid );
-
-		ux /= rho;
-		uy /= rho;
-		uz /= rho;
-
-		if( TEST_FLAG_SWEEP( srcGrid, ACCEL )) {
-			ux = 0.005f;
-			uy = 0.002f;
-			uz = 0.000f;
-		}
-
-		u2 = 1.5f * (ux*ux + uy*uy + uz*uz);
-		DST_C ( dstGrid ) = (1.0f-OMEGA)*SRC_C ( srcGrid ) + DFL1*OMEGA*rho*(1.0f                                 - u2);
-
-		DST_N ( dstGrid ) = (1.0f-OMEGA)*SRC_N ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       uy*(4.5f*uy       + 3.0f) - u2);
-		DST_S ( dstGrid ) = (1.0f-OMEGA)*SRC_S ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       uy*(4.5f*uy       - 3.0f) - u2);
-		DST_E ( dstGrid ) = (1.0f-OMEGA)*SRC_E ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       ux*(4.5f*ux       + 3.0f) - u2);
-		DST_W ( dstGrid ) = (1.0f-OMEGA)*SRC_W ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       ux*(4.5f*ux       - 3.0f) - u2);
-		DST_T ( dstGrid ) = (1.0f-OMEGA)*SRC_T ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       uz*(4.5f*uz       + 3.0f) - u2);
-		DST_B ( dstGrid ) = (1.0f-OMEGA)*SRC_B ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       uz*(4.5f*uz       - 3.0f) - u2);
-
-		DST_NE( dstGrid ) = (1.0f-OMEGA)*SRC_NE( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2);
-		DST_NW( dstGrid ) = (1.0f-OMEGA)*SRC_NW( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2);
-		DST_SE( dstGrid ) = (1.0f-OMEGA)*SRC_SE( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2);
-		DST_SW( dstGrid ) = (1.0f-OMEGA)*SRC_SW( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2);
-		DST_NT( dstGrid ) = (1.0f-OMEGA)*SRC_NT( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2);
-		DST_NB( dstGrid ) = (1.0f-OMEGA)*SRC_NB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2);
-		DST_ST( dstGrid ) = (1.0f-OMEGA)*SRC_ST( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2);
-		DST_SB( dstGrid ) = (1.0f-OMEGA)*SRC_SB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2);
-		DST_ET( dstGrid ) = (1.0f-OMEGA)*SRC_ET( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2);
-		DST_EB( dstGrid ) = (1.0f-OMEGA)*SRC_EB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2);
-		DST_WT( dstGrid ) = (1.0f-OMEGA)*SRC_WT( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2);
-		DST_WB( dstGrid ) = (1.0f-OMEGA)*SRC_WB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2);
-	SWEEP_END
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  if (TEST_FLAG_SWEEP(srcGrid, OBSTACLE)) {
+    DST_C(dstGrid) = SRC_C(srcGrid);
+    DST_S(dstGrid) = SRC_N(srcGrid);
+    DST_N(dstGrid) = SRC_S(srcGrid);
+    DST_W(dstGrid) = SRC_E(srcGrid);
+    DST_E(dstGrid) = SRC_W(srcGrid);
+    DST_B(dstGrid) = SRC_T(srcGrid);
+    DST_T(dstGrid) = SRC_B(srcGrid);
+    DST_SW(dstGrid) = SRC_NE(srcGrid);
+    DST_SE(dstGrid) = SRC_NW(srcGrid);
+    DST_NW(dstGrid) = SRC_SE(srcGrid);
+    DST_NE(dstGrid) = SRC_SW(srcGrid);
+    DST_SB(dstGrid) = SRC_NT(srcGrid);
+    DST_ST(dstGrid) = SRC_NB(srcGrid);
+    DST_NB(dstGrid) = SRC_ST(srcGrid);
+    DST_NT(dstGrid) = SRC_SB(srcGrid);
+    DST_WB(dstGrid) = SRC_ET(srcGrid);
+    DST_WT(dstGrid) = SRC_EB(srcGrid);
+    DST_EB(dstGrid) = SRC_WT(srcGrid);
+    DST_ET(dstGrid) = SRC_WB(srcGrid);
+    continue;
+  }
+
+  rho = +SRC_C(srcGrid) + SRC_N(srcGrid) + SRC_S(srcGrid) + SRC_E(srcGrid) +
+        SRC_W(srcGrid) + SRC_T(srcGrid) + SRC_B(srcGrid) + SRC_NE(srcGrid) +
+        SRC_NW(srcGrid) + SRC_SE(srcGrid) + SRC_SW(srcGrid) + SRC_NT(srcGrid) +
+        SRC_NB(srcGrid) + SRC_ST(srcGrid) + SRC_SB(srcGrid) + SRC_ET(srcGrid) +
+        SRC_EB(srcGrid) + SRC_WT(srcGrid) + SRC_WB(srcGrid);
+
+  ux = +SRC_E(srcGrid) - SRC_W(srcGrid) + SRC_NE(srcGrid) - SRC_NW(srcGrid) +
+       SRC_SE(srcGrid) - SRC_SW(srcGrid) + SRC_ET(srcGrid) + SRC_EB(srcGrid) -
+       SRC_WT(srcGrid) - SRC_WB(srcGrid);
+  uy = +SRC_N(srcGrid) - SRC_S(srcGrid) + SRC_NE(srcGrid) + SRC_NW(srcGrid) -
+       SRC_SE(srcGrid) - SRC_SW(srcGrid) + SRC_NT(srcGrid) + SRC_NB(srcGrid) -
+       SRC_ST(srcGrid) - SRC_SB(srcGrid);
+  uz = +SRC_T(srcGrid) - SRC_B(srcGrid) + SRC_NT(srcGrid) - SRC_NB(srcGrid) +
+       SRC_ST(srcGrid) - SRC_SB(srcGrid) + SRC_ET(srcGrid) - SRC_EB(srcGrid) +
+       SRC_WT(srcGrid) - SRC_WB(srcGrid);
+
+  ux /= rho;
+  uy /= rho;
+  uz /= rho;
+
+  if (TEST_FLAG_SWEEP(srcGrid, ACCEL)) {
+    ux = 0.005f;
+    uy = 0.002f;
+    uz = 0.000f;
+  }
+
+  u2 = 1.5f * (ux * ux + uy * uy + uz * uz);
+  DST_C(dstGrid) =
+      (1.0f - OMEGA) * SRC_C(srcGrid) + DFL1 * OMEGA * rho * (1.0f - u2);
+
+  DST_N(dstGrid) = (1.0f - OMEGA) * SRC_N(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + uy * (4.5f * uy + 3.0f) - u2);
+  DST_S(dstGrid) = (1.0f - OMEGA) * SRC_S(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + uy * (4.5f * uy - 3.0f) - u2);
+  DST_E(dstGrid) = (1.0f - OMEGA) * SRC_E(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + ux * (4.5f * ux + 3.0f) - u2);
+  DST_W(dstGrid) = (1.0f - OMEGA) * SRC_W(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + ux * (4.5f * ux - 3.0f) - u2);
+  DST_T(dstGrid) = (1.0f - OMEGA) * SRC_T(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + uz * (4.5f * uz + 3.0f) - u2);
+  DST_B(dstGrid) = (1.0f - OMEGA) * SRC_B(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + uz * (4.5f * uz - 3.0f) - u2);
+
+  DST_NE(dstGrid) = (1.0f - OMEGA) * SRC_NE(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+ux + uy) * (4.5f * (+ux + uy) + 3.0f) - u2);
+  DST_NW(dstGrid) = (1.0f - OMEGA) * SRC_NW(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-ux + uy) * (4.5f * (-ux + uy) + 3.0f) - u2);
+  DST_SE(dstGrid) = (1.0f - OMEGA) * SRC_SE(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+ux - uy) * (4.5f * (+ux - uy) + 3.0f) - u2);
+  DST_SW(dstGrid) = (1.0f - OMEGA) * SRC_SW(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-ux - uy) * (4.5f * (-ux - uy) + 3.0f) - u2);
+  DST_NT(dstGrid) = (1.0f - OMEGA) * SRC_NT(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+uy + uz) * (4.5f * (+uy + uz) + 3.0f) - u2);
+  DST_NB(dstGrid) = (1.0f - OMEGA) * SRC_NB(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+uy - uz) * (4.5f * (+uy - uz) + 3.0f) - u2);
+  DST_ST(dstGrid) = (1.0f - OMEGA) * SRC_ST(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-uy + uz) * (4.5f * (-uy + uz) + 3.0f) - u2);
+  DST_SB(dstGrid) = (1.0f - OMEGA) * SRC_SB(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-uy - uz) * (4.5f * (-uy - uz) + 3.0f) - u2);
+  DST_ET(dstGrid) = (1.0f - OMEGA) * SRC_ET(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+ux + uz) * (4.5f * (+ux + uz) + 3.0f) - u2);
+  DST_EB(dstGrid) = (1.0f - OMEGA) * SRC_EB(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+ux - uz) * (4.5f * (+ux - uz) + 3.0f) - u2);
+  DST_WT(dstGrid) = (1.0f - OMEGA) * SRC_WT(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-ux + uz) * (4.5f * (-ux + uz) + 3.0f) - u2);
+  DST_WB(dstGrid) = (1.0f - OMEGA) * SRC_WB(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-ux - uz) * (4.5f * (-ux - uz) + 3.0f) - u2);
+  SWEEP_END
 }
 
 /*############################################################################*/
 
-void LBM_handleInOutFlow( LBM_Grid srcGrid ) {
-	float ux , uy , uz , rho ,
-	       ux1, uy1, uz1, rho1,
-	       ux2, uy2, uz2, rho2,
-	       u2, px, py;
-	SWEEP_VAR
+void LBM_handleInOutFlow(LBM_Grid srcGrid) {
+  float ux, uy, uz, rho, ux1, uy1, uz1, rho1, ux2, uy2, uz2, rho2, u2, px, py;
+  SWEEP_VAR
 
-	/* inflow */
-	/*voption indep*/
+  /* inflow */
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
-#pragma omp parallel for private( ux, uy, uz, rho, ux1, uy1, uz1, rho1, \
-                                  ux2, uy2, uz2, rho2, u2, px, py )
+#pragma omp parallel for private(ux, uy, uz, rho, ux1, uy1, uz1, rho1, ux2,    \
+                                 uy2, uz2, rho2, u2, px, py)
 #endif
 #endif
-	SWEEP_START( 0, 0, 0, 0, 0, 1 )
-		rho1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, C  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, N  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, S  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, E  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, W  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, T  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, B  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, ST )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, ET )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, WT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, WB );
-		rho2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, C  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, N  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, S  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, E  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, W  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, T  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, B  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, ST )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, ET )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, WT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, WB );
-
-		rho = 2.0*rho1 - rho2;
-
-		px = (SWEEP_X / (0.5*(SIZE_X-1))) - 1.0;
-		py = (SWEEP_Y / (0.5*(SIZE_Y-1))) - 1.0;
-		ux = 0.00;
-		uy = 0.00;
-		uz = 0.01 * (1.0-px*px) * (1.0-py*py);
-
-		u2 = 1.5 * (ux*ux + uy*uy + uz*uz);
-
-		LOCAL( srcGrid, C ) = DFL1*rho*(1.0                                 - u2);
-
-		LOCAL( srcGrid, N ) = DFL2*rho*(1.0 +       uy*(4.5*uy       + 3.0) - u2);
-		LOCAL( srcGrid, S ) = DFL2*rho*(1.0 +       uy*(4.5*uy       - 3.0) - u2);
-		LOCAL( srcGrid, E ) = DFL2*rho*(1.0 +       ux*(4.5*ux       + 3.0) - u2);
-		LOCAL( srcGrid, W ) = DFL2*rho*(1.0 +       ux*(4.5*ux       - 3.0) - u2);
-		LOCAL( srcGrid, T ) = DFL2*rho*(1.0 +       uz*(4.5*uz       + 3.0) - u2);
-		LOCAL( srcGrid, B ) = DFL2*rho*(1.0 +       uz*(4.5*uz       - 3.0) - u2);
-
-		LOCAL( srcGrid, NE) = DFL3*rho*(1.0 + (+ux+uy)*(4.5*(+ux+uy) + 3.0) - u2);
-		LOCAL( srcGrid, NW) = DFL3*rho*(1.0 + (-ux+uy)*(4.5*(-ux+uy) + 3.0) - u2);
-		LOCAL( srcGrid, SE) = DFL3*rho*(1.0 + (+ux-uy)*(4.5*(+ux-uy) + 3.0) - u2);
-		LOCAL( srcGrid, SW) = DFL3*rho*(1.0 + (-ux-uy)*(4.5*(-ux-uy) + 3.0) - u2);
-		LOCAL( srcGrid, NT) = DFL3*rho*(1.0 + (+uy+uz)*(4.5*(+uy+uz) + 3.0) - u2);
-		LOCAL( srcGrid, NB) = DFL3*rho*(1.0 + (+uy-uz)*(4.5*(+uy-uz) + 3.0) - u2);
-		LOCAL( srcGrid, ST) = DFL3*rho*(1.0 + (-uy+uz)*(4.5*(-uy+uz) + 3.0) - u2);
-		LOCAL( srcGrid, SB) = DFL3*rho*(1.0 + (-uy-uz)*(4.5*(-uy-uz) + 3.0) - u2);
-		LOCAL( srcGrid, ET) = DFL3*rho*(1.0 + (+ux+uz)*(4.5*(+ux+uz) + 3.0) - u2);
-		LOCAL( srcGrid, EB) = DFL3*rho*(1.0 + (+ux-uz)*(4.5*(+ux-uz) + 3.0) - u2);
-		LOCAL( srcGrid, WT) = DFL3*rho*(1.0 + (-ux+uz)*(4.5*(-ux+uz) + 3.0) - u2);
-		LOCAL( srcGrid, WB) = DFL3*rho*(1.0 + (-ux-uz)*(4.5*(-ux-uz) + 3.0) - u2);
-	SWEEP_END
-
-	/* outflow */
-	/*voption indep*/
+  SWEEP_START(0, 0, 0, 0, 0, 1)
+  rho1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, C) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, N) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, S) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, E) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, W) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, T) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, B) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, ST) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, ET) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, EB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, WT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, WB);
+  rho2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, C) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, N) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, S) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, E) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, W) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, T) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, B) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, ST) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, ET) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, EB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, WT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, WB);
+
+  rho = 2.0 * rho1 - rho2;
+
+  px = (SWEEP_X / (0.5 * (SIZE_X - 1))) - 1.0;
+  py = (SWEEP_Y / (0.5 * (SIZE_Y - 1))) - 1.0;
+  ux = 0.00;
+  uy = 0.00;
+  uz = 0.01 * (1.0 - px * px) * (1.0 - py * py);
+
+  u2 = 1.5 * (ux * ux + uy * uy + uz * uz);
+
+  LOCAL(srcGrid, C) = DFL1 * rho * (1.0 - u2);
+
+  LOCAL(srcGrid, N) = DFL2 * rho * (1.0 + uy * (4.5 * uy + 3.0) - u2);
+  LOCAL(srcGrid, S) = DFL2 * rho * (1.0 + uy * (4.5 * uy - 3.0) - u2);
+  LOCAL(srcGrid, E) = DFL2 * rho * (1.0 + ux * (4.5 * ux + 3.0) - u2);
+  LOCAL(srcGrid, W) = DFL2 * rho * (1.0 + ux * (4.5 * ux - 3.0) - u2);
+  LOCAL(srcGrid, T) = DFL2 * rho * (1.0 + uz * (4.5 * uz + 3.0) - u2);
+  LOCAL(srcGrid, B) = DFL2 * rho * (1.0 + uz * (4.5 * uz - 3.0) - u2);
+
+  LOCAL(srcGrid, NE) =
+      DFL3 * rho * (1.0 + (+ux + uy) * (4.5 * (+ux + uy) + 3.0) - u2);
+  LOCAL(srcGrid, NW) =
+      DFL3 * rho * (1.0 + (-ux + uy) * (4.5 * (-ux + uy) + 3.0) - u2);
+  LOCAL(srcGrid, SE) =
+      DFL3 * rho * (1.0 + (+ux - uy) * (4.5 * (+ux - uy) + 3.0) - u2);
+  LOCAL(srcGrid, SW) =
+      DFL3 * rho * (1.0 + (-ux - uy) * (4.5 * (-ux - uy) + 3.0) - u2);
+  LOCAL(srcGrid, NT) =
+      DFL3 * rho * (1.0 + (+uy + uz) * (4.5 * (+uy + uz) + 3.0) - u2);
+  LOCAL(srcGrid, NB) =
+      DFL3 * rho * (1.0 + (+uy - uz) * (4.5 * (+uy - uz) + 3.0) - u2);
+  LOCAL(srcGrid, ST) =
+      DFL3 * rho * (1.0 + (-uy + uz) * (4.5 * (-uy + uz) + 3.0) - u2);
+  LOCAL(srcGrid, SB) =
+      DFL3 * rho * (1.0 + (-uy - uz) * (4.5 * (-uy - uz) + 3.0) - u2);
+  LOCAL(srcGrid, ET) =
+      DFL3 * rho * (1.0 + (+ux + uz) * (4.5 * (+ux + uz) + 3.0) - u2);
+  LOCAL(srcGrid, EB) =
+      DFL3 * rho * (1.0 + (+ux - uz) * (4.5 * (+ux - uz) + 3.0) - u2);
+  LOCAL(srcGrid, WT) =
+      DFL3 * rho * (1.0 + (-ux + uz) * (4.5 * (-ux + uz) + 3.0) - u2);
+  LOCAL(srcGrid, WB) =
+      DFL3 * rho * (1.0 + (-ux - uz) * (4.5 * (-ux - uz) + 3.0) - u2);
+  SWEEP_END
+
+  /* outflow */
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
-#pragma omp parallel for private( ux, uy, uz, rho, ux1, uy1, uz1, rho1, \
-                                  ux2, uy2, uz2, rho2, u2, px, py )
+#pragma omp parallel for private(ux, uy, uz, rho, ux1, uy1, uz1, rho1, ux2,    \
+                                 uy2, uz2, rho2, u2, px, py)
 #endif
 #endif
 
-	SWEEP_START( 0, 0, SIZE_Z-1, 0, 0, SIZE_Z )
-		rho1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, C  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, N  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, S  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, E  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, W  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, T  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, B  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB );
-		ux1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, E  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, W  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB );
-		uy1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, N  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, S  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB );
-		uz1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, T  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, B  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB );
-
-		ux1 /= rho1;
-		uy1 /= rho1;
-		uz1 /= rho1;
-
-		rho2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, C  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, N  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, S  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, E  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, W  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, T  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, B  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB );
-		ux2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, E  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, W  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB );
-		uy2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, N  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, S  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB );
-		uz2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, T  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, B  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB );
-
-		ux2 /= rho2;
-		uy2 /= rho2;
-		uz2 /= rho2;
-
-		rho = 1.0;
-
-		ux = 2*ux1 - ux2;
-		uy = 2*uy1 - uy2;
-		uz = 2*uz1 - uz2;
-
-		u2 = 1.5 * (ux*ux + uy*uy + uz*uz);
-
-		LOCAL( srcGrid, C ) = DFL1*rho*(1.0                                 - u2);
-
-		LOCAL( srcGrid, N ) = DFL2*rho*(1.0 +       uy*(4.5*uy       + 3.0) - u2);
-		LOCAL( srcGrid, S ) = DFL2*rho*(1.0 +       uy*(4.5*uy       - 3.0) - u2);
-		LOCAL( srcGrid, E ) = DFL2*rho*(1.0 +       ux*(4.5*ux       + 3.0) - u2);
-		LOCAL( srcGrid, W ) = DFL2*rho*(1.0 +       ux*(4.5*ux       - 3.0) - u2);
-		LOCAL( srcGrid, T ) = DFL2*rho*(1.0 +       uz*(4.5*uz       + 3.0) - u2);
-		LOCAL( srcGrid, B ) = DFL2*rho*(1.0 +       uz*(4.5*uz       - 3.0) - u2);
-
-		LOCAL( srcGrid, NE) = DFL3*rho*(1.0 + (+ux+uy)*(4.5*(+ux+uy) + 3.0) - u2);
-		LOCAL( srcGrid, NW) = DFL3*rho*(1.0 + (-ux+uy)*(4.5*(-ux+uy) + 3.0) - u2);
-		LOCAL( srcGrid, SE) = DFL3*rho*(1.0 + (+ux-uy)*(4.5*(+ux-uy) + 3.0) - u2);
-		LOCAL( srcGrid, SW) = DFL3*rho*(1.0 + (-ux-uy)*(4.5*(-ux-uy) + 3.0) - u2);
-		LOCAL( srcGrid, NT) = DFL3*rho*(1.0 + (+uy+uz)*(4.5*(+uy+uz) + 3.0) - u2);
-		LOCAL( srcGrid, NB) = DFL3*rho*(1.0 + (+uy-uz)*(4.5*(+uy-uz) + 3.0) - u2);
-		LOCAL( srcGrid, ST) = DFL3*rho*(1.0 + (-uy+uz)*(4.5*(-uy+uz) + 3.0) - u2);
-		LOCAL( srcGrid, SB) = DFL3*rho*(1.0 + (-uy-uz)*(4.5*(-uy-uz) + 3.0) - u2);
-		LOCAL( srcGrid, ET) = DFL3*rho*(1.0 + (+ux+uz)*(4.5*(+ux+uz) + 3.0) - u2);
-		LOCAL( srcGrid, EB) = DFL3*rho*(1.0 + (+ux-uz)*(4.5*(+ux-uz) + 3.0) - u2);
-		LOCAL( srcGrid, WT) = DFL3*rho*(1.0 + (-ux+uz)*(4.5*(-ux+uz) + 3.0) - u2);
-		LOCAL( srcGrid, WB) = DFL3*rho*(1.0 + (-ux-uz)*(4.5*(-ux-uz) + 3.0) - u2);
-	SWEEP_END
+  SWEEP_START(0, 0, SIZE_Z - 1, 0, 0, SIZE_Z)
+  rho1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, C) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, N) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, S) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, E) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, W) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, T) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, B) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB);
+  ux1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, E) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, W) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB);
+  uy1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, N) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, S) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB);
+  uz1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, T) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, B) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB);
+
+  ux1 /= rho1;
+  uy1 /= rho1;
+  uz1 /= rho1;
+
+  rho2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, C) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, N) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, S) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, E) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, W) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, T) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, B) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB);
+  ux2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, E) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, W) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB);
+  uy2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, N) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, S) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB);
+  uz2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, T) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, B) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB);
+
+  ux2 /= rho2;
+  uy2 /= rho2;
+  uz2 /= rho2;
+
+  rho = 1.0;
+
+  ux = 2 * ux1 - ux2;
+  uy = 2 * uy1 - uy2;
+  uz = 2 * uz1 - uz2;
+
+  u2 = 1.5 * (ux * ux + uy * uy + uz * uz);
+
+  LOCAL(srcGrid, C) = DFL1 * rho * (1.0 - u2);
+
+  LOCAL(srcGrid, N) = DFL2 * rho * (1.0 + uy * (4.5 * uy + 3.0) - u2);
+  LOCAL(srcGrid, S) = DFL2 * rho * (1.0 + uy * (4.5 * uy - 3.0) - u2);
+  LOCAL(srcGrid, E) = DFL2 * rho * (1.0 + ux * (4.5 * ux + 3.0) - u2);
+  LOCAL(srcGrid, W) = DFL2 * rho * (1.0 + ux * (4.5 * ux - 3.0) - u2);
+  LOCAL(srcGrid, T) = DFL2 * rho * (1.0 + uz * (4.5 * uz + 3.0) - u2);
+  LOCAL(srcGrid, B) = DFL2 * rho * (1.0 + uz * (4.5 * uz - 3.0) - u2);
+
+  LOCAL(srcGrid, NE) =
+      DFL3 * rho * (1.0 + (+ux + uy) * (4.5 * (+ux + uy) + 3.0) - u2);
+  LOCAL(srcGrid, NW) =
+      DFL3 * rho * (1.0 + (-ux + uy) * (4.5 * (-ux + uy) + 3.0) - u2);
+  LOCAL(srcGrid, SE) =
+      DFL3 * rho * (1.0 + (+ux - uy) * (4.5 * (+ux - uy) + 3.0) - u2);
+  LOCAL(srcGrid, SW) =
+      DFL3 * rho * (1.0 + (-ux - uy) * (4.5 * (-ux - uy) + 3.0) - u2);
+  LOCAL(srcGrid, NT) =
+      DFL3 * rho * (1.0 + (+uy + uz) * (4.5 * (+uy + uz) + 3.0) - u2);
+  LOCAL(srcGrid, NB) =
+      DFL3 * rho * (1.0 + (+uy - uz) * (4.5 * (+uy - uz) + 3.0) - u2);
+  LOCAL(srcGrid, ST) =
+      DFL3 * rho * (1.0 + (-uy + uz) * (4.5 * (-uy + uz) + 3.0) - u2);
+  LOCAL(srcGrid, SB) =
+      DFL3 * rho * (1.0 + (-uy - uz) * (4.5 * (-uy - uz) + 3.0) - u2);
+  LOCAL(srcGrid, ET) =
+      DFL3 * rho * (1.0 + (+ux + uz) * (4.5 * (+ux + uz) + 3.0) - u2);
+  LOCAL(srcGrid, EB) =
+      DFL3 * rho * (1.0 + (+ux - uz) * (4.5 * (+ux - uz) + 3.0) - u2);
+  LOCAL(srcGrid, WT) =
+      DFL3 * rho * (1.0 + (-ux + uz) * (4.5 * (-ux + uz) + 3.0) - u2);
+  LOCAL(srcGrid, WB) =
+      DFL3 * rho * (1.0 + (-ux - uz) * (4.5 * (-ux - uz) + 3.0) - u2);
+  SWEEP_END
 }
 
 /*############################################################################*/
 
-void LBM_showGridStatistics( LBM_Grid grid ) {
-	int nObstacleCells = 0,
-	    nAccelCells    = 0,
-	    nFluidCells    = 0;
-	float ux, uy, uz;
-	float minU2  = 1e+30, maxU2  = -1e+30, u2;
-	float minRho = 1e+30, maxRho = -1e+30, rho;
-	float mass = 0;
-
-	SWEEP_VAR
-
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		rho = + LOCAL( grid, C  ) + LOCAL( grid, N  )
-		      + LOCAL( grid, S  ) + LOCAL( grid, E  )
-		      + LOCAL( grid, W  ) + LOCAL( grid, T  )
-		      + LOCAL( grid, B  ) + LOCAL( grid, NE )
-		      + LOCAL( grid, NW ) + LOCAL( grid, SE )
-		      + LOCAL( grid, SW ) + LOCAL( grid, NT )
-		      + LOCAL( grid, NB ) + LOCAL( grid, ST )
-		      + LOCAL( grid, SB ) + LOCAL( grid, ET )
-		      + LOCAL( grid, EB ) + LOCAL( grid, WT )
-		      + LOCAL( grid, WB );
-		if( rho < minRho ) minRho = rho;
-		if( rho > maxRho ) maxRho = rho;
-		mass += rho;
-
-		if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
-			nObstacleCells++;
-		}
-		else {
-			if( TEST_FLAG_SWEEP( grid, ACCEL ))
-				nAccelCells++;
-			else
-				nFluidCells++;
-
-			ux = + LOCAL( grid, E  ) - LOCAL( grid, W  )
-			     + LOCAL( grid, NE ) - LOCAL( grid, NW )
-			     + LOCAL( grid, SE ) - LOCAL( grid, SW )
-			     + LOCAL( grid, ET ) + LOCAL( grid, EB )
-			     - LOCAL( grid, WT ) - LOCAL( grid, WB );
-			uy = + LOCAL( grid, N  ) - LOCAL( grid, S  )
-			     + LOCAL( grid, NE ) + LOCAL( grid, NW )
-			     - LOCAL( grid, SE ) - LOCAL( grid, SW )
-			     + LOCAL( grid, NT ) + LOCAL( grid, NB )
-			     - LOCAL( grid, ST ) - LOCAL( grid, SB );
-			uz = + LOCAL( grid, T  ) - LOCAL( grid, B  )
-			     + LOCAL( grid, NT ) - LOCAL( grid, NB )
-			     + LOCAL( grid, ST ) - LOCAL( grid, SB )
-			     + LOCAL( grid, ET ) - LOCAL( grid, EB )
-			     + LOCAL( grid, WT ) - LOCAL( grid, WB );
-			u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);
-			if( u2 < minU2 ) minU2 = u2;
-			if( u2 > maxU2 ) maxU2 = u2;
-		}
-	SWEEP_END
-
-        printf( "LBM_showGridStatistics:\n"
-        "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
-        "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
-        "\tminU: %e maxU: %e\n\n",
-        nObstacleCells, nAccelCells, nFluidCells,
-        minRho, maxRho, mass,
-        sqrt( minU2 ), sqrt( maxU2 ) );
-
+void LBM_showGridStatistics(LBM_Grid grid) { /* print summary statistics of grid to stdout */
+  int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0;
+  float ux, uy, uz;
+  float minU2 = 1e+30, maxU2 = -1e+30, u2; /* sentinels for the min/max search */
+  float minRho = 1e+30, maxRho = -1e+30, rho;
+  float mass = 0;
+
+  SWEEP_VAR /* supplies the loop variable used by the SWEEP_ and LOCAL macros */
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) /* start cell (0,0,0), end bound (0,0,SIZE_Z) */
+  rho = +LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) +
+        LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) +
+        LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) +
+        LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) +
+        LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); /* sum of the 19 entries C..WB */
+  if (rho < minRho)
+    minRho = rho;
+  if (rho > maxRho)
+    maxRho = rho;
+  mass += rho; /* rho extrema and mass are accumulated for obstacle cells too */
+
+  if (TEST_FLAG_SWEEP(grid, OBSTACLE)) {
+    nObstacleCells++;
+  } else { /* velocity statistics are gathered for non-obstacle cells only */
+    if (TEST_FLAG_SWEEP(grid, ACCEL))
+      nAccelCells++;
+    else
+      nFluidCells++;
+
+    ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) +
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) -
+         LOCAL(grid, WT) - LOCAL(grid, WB); /* E-type entries minus W-type entries */
+    uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) -
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) -
+         LOCAL(grid, ST) - LOCAL(grid, SB); /* N-type entries minus S-type entries */
+    uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) +
+         LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) +
+         LOCAL(grid, WT) - LOCAL(grid, WB); /* T-type entries minus B-type entries */
+    u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); /* squared length of (ux, uy, uz) / rho */
+    if (u2 < minU2)
+      minU2 = u2;
+    if (u2 > maxU2)
+      maxU2 = u2;
+  }
+  SWEEP_END
+
+  printf("LBM_showGridStatistics:\n"
+         "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
+         "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
+         "\tminU: %e maxU: %e\n\n",
+         nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass,
+         sqrt(minU2), sqrt(maxU2)); /* minU/maxU are the square roots of the tracked squares */
 }
 
 /*############################################################################*/
 
-static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		const char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1];
-
-		fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
-	else {                                                     /* little endian */
-		fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void storeValue(FILE *file, OUTPUT_PRECISION *v) {
+  /* Write *v to file in little-endian byte order, whatever the host order. */
+  const size_t width = sizeof(OUTPUT_PRECISION);
+  const int probe = 1; /* its first byte in memory is non-zero only on little endian */
+  if (*(const unsigned char *)&probe != 0) { /* little endian: write as is */
+    fwrite(v, width, 1, file);
+  } else { /* big endian: reverse the bytes, then write */
+    const char *src = (const char *)v;
+    char reversed[sizeof(OUTPUT_PRECISION)];
+    size_t k;
+    for (k = 0; k < width; k++)
+      reversed[k] = src[width - k - 1];
+    fwrite(reversed, width, 1, file);
+  }
 }
 
 /*############################################################################*/
 
-static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1];
-	}
-	else {                                                     /* little endian */
-		fread( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void loadValue(FILE *file, OUTPUT_PRECISION *v) {
+  /* Read one OUTPUT_PRECISION stored in little-endian byte order into *v. */
+  const size_t width = sizeof(OUTPUT_PRECISION);
+  const int probe = 1; /* its first byte in memory is non-zero only on little endian */
+  if (*(const unsigned char *)&probe != 0) { /* little endian: read as is */
+    fread(v, width, 1, file);
+  } else { /* big endian: read, then reverse the bytes into *v */
+    char raw[sizeof(OUTPUT_PRECISION)];
+    char *dst = (char *)v;
+    size_t k;
+    fread(raw, width, 1, file);
+    for (k = 0; k < width; k++)
+      dst[k] = raw[width - k - 1];
+  }
 }
 
 /*############################################################################*/
 
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                             const int binary ) {
-	int x, y, z;
-	OUTPUT_PRECISION rho, ux, uy, uz;
-
-	FILE* file = fopen( filename, (binary ? "wb" : "w") );
-
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				rho = + GRID_ENTRY( grid, x, y, z, C  ) + GRID_ENTRY( grid, x, y, z, N  )
-				      + GRID_ENTRY( grid, x, y, z, S  ) + GRID_ENTRY( grid, x, y, z, E  )
-				      + GRID_ENTRY( grid, x, y, z, W  ) + GRID_ENTRY( grid, x, y, z, T  )
-				      + GRID_ENTRY( grid, x, y, z, B  ) + GRID_ENTRY( grid, x, y, z, NE )
-				      + GRID_ENTRY( grid, x, y, z, NW ) + GRID_ENTRY( grid, x, y, z, SE )
-				      + GRID_ENTRY( grid, x, y, z, SW ) + GRID_ENTRY( grid, x, y, z, NT )
-				      + GRID_ENTRY( grid, x, y, z, NB ) + GRID_ENTRY( grid, x, y, z, ST )
-				      + GRID_ENTRY( grid, x, y, z, SB ) + GRID_ENTRY( grid, x, y, z, ET )
-				      + GRID_ENTRY( grid, x, y, z, EB ) + GRID_ENTRY( grid, x, y, z, WT )
-				      + GRID_ENTRY( grid, x, y, z, WB );
-				ux = + GRID_ENTRY( grid, x, y, z, E  ) - GRID_ENTRY( grid, x, y, z, W  ) 
-				     + GRID_ENTRY( grid, x, y, z, NE ) - GRID_ENTRY( grid, x, y, z, NW ) 
-				     + GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) 
-				     + GRID_ENTRY( grid, x, y, z, ET ) + GRID_ENTRY( grid, x, y, z, EB ) 
-				     - GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB );
-				uy = + GRID_ENTRY( grid, x, y, z, N  ) - GRID_ENTRY( grid, x, y, z, S  ) 
-				     + GRID_ENTRY( grid, x, y, z, NE ) + GRID_ENTRY( grid, x, y, z, NW ) 
-				     - GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) 
-				     + GRID_ENTRY( grid, x, y, z, NT ) + GRID_ENTRY( grid, x, y, z, NB ) 
-				     - GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB );
-				uz = + GRID_ENTRY( grid, x, y, z, T  ) - GRID_ENTRY( grid, x, y, z, B  ) 
-				     + GRID_ENTRY( grid, x, y, z, NT ) - GRID_ENTRY( grid, x, y, z, NB ) 
-				     + GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ) 
-				     + GRID_ENTRY( grid, x, y, z, ET ) - GRID_ENTRY( grid, x, y, z, EB ) 
-				     + GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					/*
-					fwrite( &ux, sizeof( ux ), 1, file );
-					fwrite( &uy, sizeof( uy ), 1, file );
-					fwrite( &uz, sizeof( uz ), 1, file );
-					*/
-					storeValue( file, &ux );
-					storeValue( file, &uy );
-					storeValue( file, &uz );
-				} else
-					fprintf( file, "%e %e %e\n", ux, uy, uz );
-
-			}
-		}
-	}
-
-	fclose( file );
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const int binary) {
+  /* Writes ux/rho, uy/rho, uz/rho of every cell to filename (x fastest, then
+   * y, then z); if fopen fails it reports via perror and writes nothing. */
+  int x, y, z;
+  OUTPUT_PRECISION rho, ux, uy, uz;
+  FILE *file = fopen(filename, (binary ? "wb" : "w"));
+  if (file == NULL) { /* otherwise a NULL stream would reach fprintf/fclose */
+    perror(filename);
+    return;
+  }
+
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        rho = +GRID_ENTRY(grid, x, y, z, C) + GRID_ENTRY(grid, x, y, z, N) +
+              GRID_ENTRY(grid, x, y, z, S) + GRID_ENTRY(grid, x, y, z, E) +
+              GRID_ENTRY(grid, x, y, z, W) + GRID_ENTRY(grid, x, y, z, T) +
+              GRID_ENTRY(grid, x, y, z, B) + GRID_ENTRY(grid, x, y, z, NE) +
+              GRID_ENTRY(grid, x, y, z, NW) + GRID_ENTRY(grid, x, y, z, SE) +
+              GRID_ENTRY(grid, x, y, z, SW) + GRID_ENTRY(grid, x, y, z, NT) +
+              GRID_ENTRY(grid, x, y, z, NB) + GRID_ENTRY(grid, x, y, z, ST) +
+              GRID_ENTRY(grid, x, y, z, SB) + GRID_ENTRY(grid, x, y, z, ET) +
+              GRID_ENTRY(grid, x, y, z, EB) + GRID_ENTRY(grid, x, y, z, WT) +
+              GRID_ENTRY(grid, x, y, z, WB);
+        ux = +GRID_ENTRY(grid, x, y, z, E) - GRID_ENTRY(grid, x, y, z, W) +
+             GRID_ENTRY(grid, x, y, z, NE) - GRID_ENTRY(grid, x, y, z, NW) +
+             GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) +
+             GRID_ENTRY(grid, x, y, z, ET) + GRID_ENTRY(grid, x, y, z, EB) -
+             GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB);
+        uy = +GRID_ENTRY(grid, x, y, z, N) - GRID_ENTRY(grid, x, y, z, S) +
+             GRID_ENTRY(grid, x, y, z, NE) + GRID_ENTRY(grid, x, y, z, NW) -
+             GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) +
+             GRID_ENTRY(grid, x, y, z, NT) + GRID_ENTRY(grid, x, y, z, NB) -
+             GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB);
+        uz = +GRID_ENTRY(grid, x, y, z, T) - GRID_ENTRY(grid, x, y, z, B) +
+             GRID_ENTRY(grid, x, y, z, NT) - GRID_ENTRY(grid, x, y, z, NB) +
+             GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB) +
+             GRID_ENTRY(grid, x, y, z, ET) - GRID_ENTRY(grid, x, y, z, EB) +
+             GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB);
+        ux /= rho;
+        uy /= rho;
+        uz /= rho;
+
+        if (binary) { /* three values per cell, each written by storeValue */
+          storeValue(file, &ux);
+          storeValue(file, &uy);
+          storeValue(file, &uz);
+        } else /* text mode: one "ux uy uz" line per cell */
+          fprintf(file, "%e %e %e\n", ux, uy, uz);
+      }
+    }
+  }
+
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_compareVelocityField( LBM_Grid grid, const char* filename,
-                             const int binary ) {
-	int x, y, z;
-	float rho, ux, uy, uz;
-	OUTPUT_PRECISION fileUx, fileUy, fileUz,
-	                 dUx, dUy, dUz,
-	                 diff2, maxDiff2 = -1e+30;
-
-	FILE* file = fopen( filename, (binary ? "rb" : "r") );
-
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				rho = + GRID_ENTRY( grid, x, y, z, C  ) + GRID_ENTRY( grid, x, y, z, N  )
-				      + GRID_ENTRY( grid, x, y, z, S  ) + GRID_ENTRY( grid, x, y, z, E  )
-				      + GRID_ENTRY( grid, x, y, z, W  ) + GRID_ENTRY( grid, x, y, z, T  )
-				      + GRID_ENTRY( grid, x, y, z, B  ) + GRID_ENTRY( grid, x, y, z, NE )
-				      + GRID_ENTRY( grid, x, y, z, NW ) + GRID_ENTRY( grid, x, y, z, SE )
-				      + GRID_ENTRY( grid, x, y, z, SW ) + GRID_ENTRY( grid, x, y, z, NT )
-				      + GRID_ENTRY( grid, x, y, z, NB ) + GRID_ENTRY( grid, x, y, z, ST )
-				      + GRID_ENTRY( grid, x, y, z, SB ) + GRID_ENTRY( grid, x, y, z, ET )
-				      + GRID_ENTRY( grid, x, y, z, EB ) + GRID_ENTRY( grid, x, y, z, WT )
-				      + GRID_ENTRY( grid, x, y, z, WB );
-				ux = + GRID_ENTRY( grid, x, y, z, E  ) - GRID_ENTRY( grid, x, y, z, W  ) 
-				     + GRID_ENTRY( grid, x, y, z, NE ) - GRID_ENTRY( grid, x, y, z, NW ) 
-				     + GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) 
-				     + GRID_ENTRY( grid, x, y, z, ET ) + GRID_ENTRY( grid, x, y, z, EB ) 
-				     - GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB );
-				uy = + GRID_ENTRY( grid, x, y, z, N  ) - GRID_ENTRY( grid, x, y, z, S  ) 
-				     + GRID_ENTRY( grid, x, y, z, NE ) + GRID_ENTRY( grid, x, y, z, NW ) 
-				     - GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) 
-				     + GRID_ENTRY( grid, x, y, z, NT ) + GRID_ENTRY( grid, x, y, z, NB ) 
-				     - GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB );
-				uz = + GRID_ENTRY( grid, x, y, z, T  ) - GRID_ENTRY( grid, x, y, z, B  ) 
-				     + GRID_ENTRY( grid, x, y, z, NT ) - GRID_ENTRY( grid, x, y, z, NB ) 
-				     + GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ) 
-				     + GRID_ENTRY( grid, x, y, z, ET ) - GRID_ENTRY( grid, x, y, z, EB ) 
-				     + GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					loadValue( file, &fileUx );
-					loadValue( file, &fileUy );
-					loadValue( file, &fileUz );
-				}
-				else {
-					if( sizeof( OUTPUT_PRECISION ) == sizeof( double )) {
-						fscanf( file, "%lf %lf %lf\n", &fileUx, &fileUy, &fileUz );
-					}
-					else {
-						fscanf( file, "%f %f %f\n", &fileUx, &fileUy, &fileUz );
-					}
-				}
-
-				dUx = ux - fileUx;
-				dUy = uy - fileUy;
-				dUz = uz - fileUz;
-				diff2 = dUx*dUx + dUy*dUy + dUz*dUz;
-				if( diff2 > maxDiff2 ) maxDiff2 = diff2;
-			}
-		}
-	}
+void LBM_compareVelocityField(LBM_Grid grid, const char *filename,
+                              const int binary) {
+  /* Compares grid's per-cell velocities with those stored in filename, keeping the largest squared difference; returns early (perror) if fopen fails. */
+  int x, y, z;
+  float rho, ux, uy, uz;
+  OUTPUT_PRECISION fileUx, fileUy, fileUz, dUx, dUy, dUz, diff2,
+      maxDiff2 = -1e+30;
+  FILE *file = fopen(filename, (binary ? "rb" : "r"));
+  if (file == NULL) { /* otherwise a NULL stream would reach fread/fscanf/fclose */
+    perror(filename);
+    return;
+  }
+
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        rho = +GRID_ENTRY(grid, x, y, z, C) + GRID_ENTRY(grid, x, y, z, N) +
+              GRID_ENTRY(grid, x, y, z, S) + GRID_ENTRY(grid, x, y, z, E) +
+              GRID_ENTRY(grid, x, y, z, W) + GRID_ENTRY(grid, x, y, z, T) +
+              GRID_ENTRY(grid, x, y, z, B) + GRID_ENTRY(grid, x, y, z, NE) +
+              GRID_ENTRY(grid, x, y, z, NW) + GRID_ENTRY(grid, x, y, z, SE) +
+              GRID_ENTRY(grid, x, y, z, SW) + GRID_ENTRY(grid, x, y, z, NT) +
+              GRID_ENTRY(grid, x, y, z, NB) + GRID_ENTRY(grid, x, y, z, ST) +
+              GRID_ENTRY(grid, x, y, z, SB) + GRID_ENTRY(grid, x, y, z, ET) +
+              GRID_ENTRY(grid, x, y, z, EB) + GRID_ENTRY(grid, x, y, z, WT) +
+              GRID_ENTRY(grid, x, y, z, WB);
+        ux = +GRID_ENTRY(grid, x, y, z, E) - GRID_ENTRY(grid, x, y, z, W) +
+             GRID_ENTRY(grid, x, y, z, NE) - GRID_ENTRY(grid, x, y, z, NW) +
+             GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) +
+             GRID_ENTRY(grid, x, y, z, ET) + GRID_ENTRY(grid, x, y, z, EB) -
+             GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB);
+        uy = +GRID_ENTRY(grid, x, y, z, N) - GRID_ENTRY(grid, x, y, z, S) +
+             GRID_ENTRY(grid, x, y, z, NE) + GRID_ENTRY(grid, x, y, z, NW) -
+             GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) +
+             GRID_ENTRY(grid, x, y, z, NT) + GRID_ENTRY(grid, x, y, z, NB) -
+             GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB);
+        uz = +GRID_ENTRY(grid, x, y, z, T) - GRID_ENTRY(grid, x, y, z, B) +
+             GRID_ENTRY(grid, x, y, z, NT) - GRID_ENTRY(grid, x, y, z, NB) +
+             GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB) +
+             GRID_ENTRY(grid, x, y, z, ET) - GRID_ENTRY(grid, x, y, z, EB) +
+             GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB);
+        ux /= rho;
+        uy /= rho;
+        uz /= rho;
+        if (binary) { /* three values per cell, each read by loadValue */
+          loadValue(file, &fileUx);
+          loadValue(file, &fileUy);
+          loadValue(file, &fileUz);
+        } else if (sizeof(OUTPUT_PRECISION) == sizeof(double)) { /* pick the conversion matching OUTPUT_PRECISION */
+          fscanf(file, "%lf %lf %lf\n", &fileUx, &fileUy, &fileUz);
+        } else {
+          fscanf(file, "%f %f %f\n", &fileUx, &fileUy, &fileUz);
+        }
+        dUx = ux - fileUx;
+        dUy = uy - fileUy;
+        dUz = uz - fileUz;
+        diff2 = dUx * dUx + dUy * dUy + dUz * dUz;
+        if (diff2 > maxDiff2)
+          maxDiff2 = diff2;
+      }
+    }
+  }
 
 #if defined(SPEC_CPU)
-	printf( "LBM_compareVelocityField: maxDiff = %e  \n\n",
-	        sqrt( maxDiff2 )  );
+  printf("LBM_compareVelocityField: maxDiff = %e  \n\n", sqrt(maxDiff2));
 #else
-	printf( "LBM_compareVelocityField: maxDiff = %e  ==>  %s\n\n",
-	        sqrt( maxDiff2 ),
-	        sqrt( maxDiff2 ) > 1e-5 ? "##### ERROR #####" : "OK" );
+  printf("LBM_compareVelocityField: maxDiff = %e  ==>  %s\n\n", sqrt(maxDiff2),
+         sqrt(maxDiff2) > 1e-5 ? "##### ERROR #####" : "OK");
 #endif
-	fclose( file );
+  fclose(file);
 }
-
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.h
index e35818c0b300593f382a61131e7a35584d35cee1..94189f0f2bcc080ed79e42941b5a0638649d46e3 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm.h
@@ -18,30 +18,31 @@ typedef enum {C = 0,
               NT, NB, ST, SB,
               ET, EB, WT, WB,
               FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
-	      */
+              */
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
-
+typedef enum { /* per-cell flag bits; every enumerator is a distinct power of two */
+  OBSTACLE = 1 << 0, /* bit 0 */
+  ACCEL = 1 << 1, /* bit 1 */
+  IN_OUT_FLOW = 1 << 2 /* bit 2 */
+} CELL_FLAGS;
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_initializeSpecialCellsForChannel( LBM_Grid grid );
-void LBM_swapGrids( LBM_GridPtr* grid1, LBM_GridPtr* grid2 );
-void LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid );
-void LBM_handleInOutFlow( LBM_Grid srcGrid );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
-void LBM_compareVelocityField( LBM_Grid grid, const char* filename,
-                             const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_initializeSpecialCellsForChannel(LBM_Grid grid);
+void LBM_swapGrids(LBM_GridPtr *grid1, LBM_GridPtr *grid2);
+void LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid);
+void LBM_handleInOutFlow(LBM_Grid srcGrid);
+void LBM_showGridStatistics(LBM_Grid Grid); /* prints cell counts, rho and speed extrema, and total mass (per the lbm.c definition) */
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary); /* writes per-cell velocities; non-zero binary selects raw values over "%e" text */
+void LBM_compareVelocityField(LBM_Grid grid, const char *filename,
+                              const BOOL binary); /* reads such a file back and prints the largest deviation; NOTE(review): lbm.c declares binary as const int - presumably BOOL is an int typedef, confirm */
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm_1d_array.h b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm_1d_array.h
index 42c999e204dffc83c1affe8d56e086dcf1815b43..92b4c1b21dc9d87531691b3fce4bd1ff01b201f8 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm_1d_array.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/lbm_1d_array.h
@@ -3,163 +3,204 @@
 #ifndef _LBM_MACROS_H_
 #define _LBM_MACROS_H_
 
-typedef enum {C = 0,
-    N, S, E, W, T, B,
-    NE, NW, SE, SW,
-    NT, NB, ST, SB,
-    ET, EB, WT, WB,
-    FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
-#define SIZE   (120)
-#define SIZE_X (1*SIZE)
-#define SIZE_Y (1*SIZE)
+typedef enum { /* position of each entry inside one cell's record (the e argument of CALC_INDEX) */
+  C = 0,
+  N,
+  S,
+  E,
+  W,
+  T,
+  B,
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS, /* slot 19, following the 19 entries C..WB */
+  N_CELL_ENTRIES /* record length: 20 entries per cell */
+} CELL_ENTRIES;
+#define SIZE (120) /* base edge length shared by SIZE_X and SIZE_Y */
+#define SIZE_X (1 * SIZE) /* number of cells along x */
+#define SIZE_Y (1 * SIZE) /* number of cells along y */
 #define SIZE_Z (150)
 /*############################################################################*/
 
-typedef float LBM_Grid[SIZE_Z*SIZE_Y*SIZE_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+typedef float LBM_Grid[SIZE_Z * SIZE_Y * SIZE_X * N_CELL_ENTRIES]; /* flat float array: N_CELL_ENTRIES values for each of SIZE_X*SIZE_Y*SIZE_Z cells */
+typedef LBM_Grid *LBM_GridPtr; /* pointer to a whole grid array */
 
 /*############################################################################*/
 
-#define CALC_INDEX(x,y,z,e) ((e)+N_CELL_ENTRIES*((x)+ \
-						 (y)*SIZE_X+(z)*SIZE_X*SIZE_Y))
+#define CALC_INDEX(x, y, z, e)                                                 \
+  ((e) + N_CELL_ENTRIES * ((x) + (y)*SIZE_X + (z)*SIZE_X * SIZE_Y)) /* flat index of entry e of cell (x, y, z); x varies fastest, then y, then z */
 
 #define SWEEP_VAR int i;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-  for( i = CALC_INDEX(x1, y1, z1, 0); \
-       i < CALC_INDEX(x2, y2, z2, 0); \
-       i += N_CELL_ENTRIES ) {
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (i = CALC_INDEX(x1, y1, z1, 0); i < CALC_INDEX(x2, y2, z2, 0);           \
+       i += N_CELL_ENTRIES) { /* opens the loop block that SWEEP_END closes; i advances one cell record per step and the end cell is excluded */
 
 #define SWEEP_END }
 
-#define SWEEP_X  ((i / N_CELL_ENTRIES) % SIZE_X)
+#define SWEEP_X ((i / N_CELL_ENTRIES) % SIZE_X) /* x coordinate of the cell at sweep index i */
 #define SWEEP_Y (((i / N_CELL_ENTRIES) / SIZE_X) % SIZE_Y)
-#define SWEEP_Z  ((i / N_CELL_ENTRIES) / (SIZE_X*SIZE_Y))
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX(dx, dy, dz, e)+(i)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_Z ((i / N_CELL_ENTRIES) / (SIZE_X * SIZE_Y)) /* z coordinate of the cell at sweep index i */
+
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)]) /* entry e of the cell at absolute position (x, y, z) */
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX(dx, dy, dz, e) + (i)]) /* entry e of the cell offset by (dx, dy, dz) from the cell at sweep index i */
+
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) /* entry e of the current sweep cell */
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e)) /* zero offset: same cell as LOCAL */
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e)) /* offsets: N/S = y+1/y-1, E/W = x+1/x-1, T/B = z+1/z-1 */
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e)) /* two-letter names combine the two single-letter offsets */
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #define COLLIDE_STREAM
 #ifdef COLLIDE_STREAM
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C)) /* COLLIDE_STREAM branch: each SRC_x is entry x of the current sweep cell */
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+
+#define DST_C(g) (NEIGHBOR_C(g, C)) /* COLLIDE_STREAM branch: each DST_x is entry x of the neighbour in direction x */
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* COLLIDE_STREAM */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C))
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* COLLIDE_STREAM */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* const _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
+#define FLAG_VAR(v) unsigned int *const _aux_ = MAGIC_CAST(v)
+
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.c b/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.c
index 85600dbfdf20059a71694b7ae72f0243ee5c82eb..6985e3e58b300a7fad88ed4623340562693c80bd 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.c
@@ -8,10 +8,10 @@
 #include <stdlib.h>
 
 #if defined(SPEC_CPU)
-#   include <time.h>
+#include <time.h>
 #else
-#   include <sys/times.h>
-#   include <unistd.h>
+#include <sys/times.h>
+#include <unistd.h>
 #endif
 
 #include <sys/stat.h>
@@ -23,168 +23,169 @@ static LBM_GridPtr srcGrid, dstGrid;
 /*############################################################################*/
 
 struct pb_TimerSet timers;
-int main( int nArgs, char* arg[] ) {
-	MAIN_Param param;
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
 #if !defined(SPEC_CPU)
-	MAIN_Time time;
+  MAIN_Time time;
 #endif
-	int t;
+  int t;
 
-        pb_InitializeTimerSet(&timers);
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-        struct pb_Parameters* params;
-        params = pb_ReadParameters(&nArgs, arg);
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
 
-	MAIN_parseCommandLine( nArgs, arg, &param, params );
-	MAIN_printInfo( &param );
-	MAIN_initialize( &param );
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
+  MAIN_initialize(&param);
 #if !defined(SPEC_CPU)
-	MAIN_startClock( &time );
+  MAIN_startClock(&time);
 #endif
 
-	for( t = 1; t <= param.nTimeSteps; t++ ) {
-		if( param.simType == CHANNEL ) {
-			LBM_handleInOutFlow( *srcGrid );
-		}
+  for (t = 1; t <= param.nTimeSteps; t++) {
+    if (param.simType == CHANNEL) {
+      LBM_handleInOutFlow(*srcGrid);
+    }
 
-		LBM_performStreamCollide( *srcGrid, *dstGrid );
-		LBM_swapGrids( &srcGrid, &dstGrid );
+    LBM_performStreamCollide(*srcGrid, *dstGrid);
+    LBM_swapGrids(&srcGrid, &dstGrid);
 
-		if( (t & 63) == 0 ) {
-			printf( "timestep: %i\n", t );
-			//LBM_showGridStatistics( *srcGrid );
-		}
-	}
+    if ((t & 63) == 0) {
+      printf("timestep: %i\n", t);
+      // LBM_showGridStatistics( *srcGrid );
+    }
+  }
 
 #if !defined(SPEC_CPU)
-	MAIN_stopClock( &time, &param );
+  MAIN_stopClock(&time, &param);
 #endif
 
-	MAIN_finalize( &param );
+  MAIN_finalize(&param);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        pb_PrintTimerSet(&timers);
-        pb_FreeParameters(params);
-	return 0;
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(params);
+  return 0;
 }
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params) {
-	struct stat fileStat;
-	
-	if( nArgs < 2 ) {
-		printf( "syntax: lbm <time steps>\n" );
-		exit( 1 );
-	}
-
-	param->nTimeSteps     = atoi( arg[1] );
-
-	if( params->inpFiles[0] != NULL ) {
-		param->obstacleFilename = params->inpFiles[0];
-
-		if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-			printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-			         param->obstacleFilename );
-			exit( 1 );
-		}
-		if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-			printf( "MAIN_parseCommandLine:\n"
-			        "\tsize of file '%s' is %i bytes\n"
-					    "\texpected size is %i bytes\n",
-			        param->obstacleFilename, (int) fileStat.st_size,
-			        SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-			exit( 1 );
-		}
-	}
-	else param->obstacleFilename = NULL;
-
-	param->resultFilename = params->outFile;
-	param->action         = STORE;
-	param->simType        = LDC;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
+
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
+
+  param->nTimeSteps = atoi(arg[1]);
+
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
+    }
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
+    }
+  } else
+    param->obstacleFilename = NULL;
+
+  param->resultFilename = params->outFile;
+  param->action = STORE;
+  param->simType = LDC;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-	const char actionString[3][32] = {"nothing", "compare", "store"};
-	const char simTypeString[3][32] = {"lid-driven cavity", "channel flow"};
-	printf( "MAIN_printInfo:\n"
-	        "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-	        "\tnTimeSteps     : %i\n"
-	        "\tresult file    : %s\n"
-	        "\taction         : %s\n"
-	        "\tsimulation type: %s\n"
-	        "\tobstacle file  : %s\n\n",
-	        SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-	        param->nTimeSteps, param->resultFilename, 
-	        actionString[param->action], simTypeString[param->simType],
-	        (param->obstacleFilename == NULL) ? "<none>" :
-	                                            param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  const char actionString[3][32] = {"nothing", "compare", "store"};
+  const char simTypeString[3][32] = {"lid-driven cavity", "channel flow"};
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, actionString[param->action],
+         simTypeString[param->simType],
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
-void MAIN_initialize( const MAIN_Param* param ) {
-	LBM_allocateGrid( (float**) &srcGrid );
-	LBM_allocateGrid( (float**) &dstGrid );
+void MAIN_initialize(const MAIN_Param *param) {
+  LBM_allocateGrid((float **)&srcGrid);
+  LBM_allocateGrid((float **)&dstGrid);
 
-	LBM_initializeGrid( *srcGrid );
-	LBM_initializeGrid( *dstGrid );
+  LBM_initializeGrid(*srcGrid);
+  LBM_initializeGrid(*dstGrid);
 
-	if( param->obstacleFilename != NULL ) {
-		LBM_loadObstacleFile( *srcGrid, param->obstacleFilename );
-		LBM_loadObstacleFile( *dstGrid, param->obstacleFilename );
-	}
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(*srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(*dstGrid, param->obstacleFilename);
+  }
 
-	if( param->simType == CHANNEL ) {
-		LBM_initializeSpecialCellsForChannel( *srcGrid );
-		LBM_initializeSpecialCellsForChannel( *dstGrid );
-	}
-	else {
-		LBM_initializeSpecialCellsForLDC( *srcGrid );
-		LBM_initializeSpecialCellsForLDC( *dstGrid );
-	}
+  if (param->simType == CHANNEL) {
+    LBM_initializeSpecialCellsForChannel(*srcGrid);
+    LBM_initializeSpecialCellsForChannel(*dstGrid);
+  } else {
+    LBM_initializeSpecialCellsForLDC(*srcGrid);
+    LBM_initializeSpecialCellsForLDC(*dstGrid);
+  }
 
-	LBM_showGridStatistics( *srcGrid );
+  LBM_showGridStatistics(*srcGrid);
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param ) {
-	LBM_showGridStatistics( *srcGrid );
+void MAIN_finalize(const MAIN_Param *param) {
+  LBM_showGridStatistics(*srcGrid);
 
-	if( param->action == COMPARE )
-		LBM_compareVelocityField( *srcGrid, param->resultFilename, TRUE );
-	if( param->action == STORE )
-	LBM_storeVelocityField( *srcGrid, param->resultFilename, TRUE );
+  if (param->action == COMPARE)
+    LBM_compareVelocityField(*srcGrid, param->resultFilename, TRUE);
+  if (param->action == STORE)
+    LBM_storeVelocityField(*srcGrid, param->resultFilename, TRUE);
 
-	LBM_freeGrid( (float**) &srcGrid );
-	LBM_freeGrid( (float**) &dstGrid );
+  LBM_freeGrid((float **)&srcGrid);
+  LBM_freeGrid((float **)&dstGrid);
 }
 
 #if !defined(SPEC_CPU)
 /*############################################################################*/
 
-void MAIN_startClock( MAIN_Time* time ) {
-	time->timeScale = 1.0 / sysconf( _SC_CLK_TCK );
-	time->tickStart = times( &(time->timeStart) );
+void MAIN_startClock(MAIN_Time *time) {
+  time->timeScale = 1.0 / sysconf(_SC_CLK_TCK);
+  time->tickStart = times(&(time->timeStart));
 }
 
-
 /*############################################################################*/
 
-void MAIN_stopClock( MAIN_Time* time, const MAIN_Param* param ) {
-	time->tickStop = times( &(time->timeStop) );
-
-	printf( "MAIN_stopClock:\n"
-	        "\tusr: %7.2f sys: %7.2f tot: %7.2f wct: %7.2f MLUPS: %5.2f\n\n",
-	        (time->timeStop.tms_utime - time->timeStart.tms_utime) * time->timeScale,
-	        (time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale,
-	        (time->timeStop.tms_utime - time->timeStart.tms_utime +
-	         time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale,
-	        (time->tickStop           - time->tickStart          ) * time->timeScale,
-	        1.0e-6 * SIZE_X * SIZE_Y * SIZE_Z * param->nTimeSteps /
-	        (time->tickStop           - time->tickStart          ) / time->timeScale );
+void MAIN_stopClock(MAIN_Time *time, const MAIN_Param *param) {
+  time->tickStop = times(&(time->timeStop));
+
+  printf(
+      "MAIN_stopClock:\n"
+      "\tusr: %7.2f sys: %7.2f tot: %7.2f wct: %7.2f MLUPS: %5.2f\n\n",
+      (time->timeStop.tms_utime - time->timeStart.tms_utime) * time->timeScale,
+      (time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale,
+      (time->timeStop.tms_utime - time->timeStart.tms_utime +
+       time->timeStop.tms_stime - time->timeStart.tms_stime) *
+          time->timeScale,
+      (time->tickStop - time->tickStart) * time->timeScale,
+      1.0e-6 * SIZE_X * SIZE_Y * SIZE_Z * param->nTimeSteps /
+          (time->tickStop - time->tickStart) / time->timeScale);
 }
 #endif
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.h b/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.h
index e207f4158f06a1cdf74ccc4fd0eb982543de0f87..4eb16dd70d0a121488ae657442b7e950a0afd16a 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cpu/main.h
@@ -18,34 +18,35 @@
 
 #if !defined(SPEC_CPU)
 typedef struct {
-	float timeScale;
-	clock_t tickStart, tickStop;
-	struct tms timeStart, timeStop;
+  float timeScale;
+  clock_t tickStart, tickStop;
+  struct tms timeStart, timeStop;
 
 } MAIN_Time;
 #endif
 
-typedef enum {NOTHING = 0, COMPARE, STORE} MAIN_Action;
-typedef enum {LDC = 0, CHANNEL} MAIN_SimType;
+typedef enum { NOTHING = 0, COMPARE, STORE } MAIN_Action;
+typedef enum { LDC = 0, CHANNEL } MAIN_SimType;
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	MAIN_Action action;
-	MAIN_SimType simType;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  MAIN_Action action;
+  MAIN_SimType simType;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param );
-void MAIN_finalize( const MAIN_Param* param );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param);
+void MAIN_finalize(const MAIN_Param *param);
 
 #if !defined(SPEC_CPU)
-void MAIN_startClock( MAIN_Time* time );
-void MAIN_stopClock( MAIN_Time* time, const MAIN_Param* param );
+void MAIN_startClock(MAIN_Time *time);
+void MAIN_stopClock(MAIN_Time *time, const MAIN_Param *param);
 #endif
 
 /*############################################################################*/
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda/layout_config.h
index 793109e2a547fc68978582f5f9514814a9272b58..9c3c00905cdf4ceab6a84341766c375499a8dc1a 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cuda/layout_config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda/layout_config.h
@@ -13,30 +13,30 @@
 
 /*############################################################################*/
 
-//Unchangeable settings: volume simulation size for the given example
+// Unchangeable settings: volume simulation size for the given example
 #define SIZE_X (120)
 #define SIZE_Y (120)
 #define SIZE_Z (150)
 
-//Changeable settings
-//Padding in each dimension
-//Align rows of X to 128-bytes
+// Changeable settings
+// Padding in each dimension
+// Align rows of X to 128-bytes
 #define PADDING_X (8)
 #define PADDING_Y (0)
 #define PADDING_Z (4)
 
-//Pitch in each dimension
-#define PADDED_X (SIZE_X+PADDING_X)
-#define PADDED_Y (SIZE_Y+PADDING_Y)
-#define PADDED_Z (SIZE_Z+PADDING_Z)
+// Pitch in each dimension
+#define PADDED_X (SIZE_X + PADDING_X)
+#define PADDED_Y (SIZE_Y + PADDING_Y)
+#define PADDED_Z (SIZE_Z + PADDING_Z)
 
-#define TOTAL_CELLS (SIZE_X*SIZE_Y*(SIZE_Z))
-#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*(PADDED_Z))
+#define TOTAL_CELLS (SIZE_X * SIZE_Y * (SIZE_Z))
+#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * (PADDED_Z))
 
-//Flattening function
+// Flattening function
 //  This macro will be used to map a 3-D index and element to a value
-#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \
-                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
+#define CALC_INDEX(x, y, z, e)                                                 \
+  (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y))
 
 // Set this value to 1 for GATHER, or 0 for SCATTER
 #if 0
@@ -45,7 +45,7 @@
 #define SCATTER
 #endif
 
-//CUDA block size (not trivially changeable here)
+// CUDA block size (not trivially changeable here)
 #define BLOCK_SIZE SIZE_X
 
 /*############################################################################*/
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm.h
index 6f3b1138805911d1053c3c78ebcf9496bec7604f..10c32ddd3761120bfb1073fe3c9ee65a9ef8cff6 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm.h
@@ -13,21 +13,38 @@
 
 /*############################################################################*/
 
-
 /*############################################################################*/
 
-typedef enum {C = 0,
-              N, S, E, W, T, B,
-              NE, NW, SE, SW,
-              NT, NB, ST, SB,
-              ET, EB, WT, WB,
-              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+typedef enum {
+  C = 0,
+  N,
+  S,
+  E,
+  W,
+  T,
+  B,
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS,
+  N_CELL_ENTRIES
+} CELL_ENTRIES;
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
-
+typedef enum {
+  OBSTACLE = 1 << 0,
+  ACCEL = 1 << 1,
+  IN_OUT_FLOW = 1 << 2
+} CELL_FLAGS;
 
 #include "layout_config.h"
 #include "lbm_macros.h"
@@ -36,23 +53,23 @@ typedef enum {OBSTACLE    = 1 << 0,
 #ifdef __cplusplus
 extern "C" {
 #endif
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_swapGrids( LBM_GridPtr grid1, LBM_GridPtr grid2 );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_swapGrids(LBM_GridPtr grid1, LBM_GridPtr grid2);
+void LBM_showGridStatistics(LBM_Grid Grid);
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary);
 
 /* CUDA ***********************************************************************/
 
-void CUDA_LBM_allocateGrid( float** ptr );
-void CUDA_LBM_freeGrid( float** ptr );
-void CUDA_LBM_initializeGrid( float** d_grid, float** h_grid );
-void CUDA_LBM_getDeviceGrid( float** d_grid, float** h_grid );
-void CUDA_LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid );
+void CUDA_LBM_allocateGrid(float **ptr);
+void CUDA_LBM_freeGrid(float **ptr);
+void CUDA_LBM_initializeGrid(float **d_grid, float **h_grid);
+void CUDA_LBM_getDeviceGrid(float **d_grid, float **h_grid);
+void CUDA_LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid);
 #ifdef __cplusplus
 }
 #endif
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm_macros.h
index 9c3934c01f32db7b7b8e2075dda275f0c140f75a..7f5e522da17823325e1f74bd405faa4b560c1a31 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm_macros.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda/lbm_macros.h
@@ -19,154 +19,175 @@
 
 /*############################################################################*/
 
-typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+typedef float
+    *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
+typedef LBM_Grid *LBM_GridPtr;
 
 /*############################################################################*/
 
-
-#define SWEEP_X  __temp_x__
-#define SWEEP_Y  __temp_y__
-#define SWEEP_Z  __temp_z__
+#define SWEEP_X __temp_x__
+#define SWEEP_Y __temp_y__
+#define SWEEP_Z __temp_z__
 #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-	for( __temp_z__ = z1; \
-	     __temp_z__ < z2; \
-		__temp_z__++) { \
-            for( __temp_y__ = 0; \
-                 __temp_y__ < SIZE_Y; \
-                 __temp_y__++) { \
-		for(__temp_x__ = 0; \
-	            __temp_x__ < SIZE_X; \
-                    __temp_x__++) { \
-
-#define SWEEP_END }}}
-
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) {                       \
+    for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) {                  \
+      for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) {
+
+#define SWEEP_END                                                              \
+  }                                                                            \
+  }                                                                            \
+  }
+
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)])
+
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #ifdef SCATTER
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C))
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+
+#define DST_C(g) (NEIGHBOR_C(g, C))
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* SCATTER */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C))
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* SCATTER */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
+#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v)
+
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.cc b/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.cc
index bc238e27340f6fdd16f96be059e5fc831d598609..6ed10cad68cd7c02b427ad58dbe784d5fd15b6aa 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.cc
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.cc
@@ -18,159 +18,156 @@
 /*############################################################################*/
 static LBM_Grid CUDA_srcGrid, CUDA_dstGrid;
 
-
 /*############################################################################*/
 
 struct pb_TimerSet timers;
-int main( int nArgs, char* arg[] ) {
-	MAIN_Param param;
-	int t;
-
-	pb_InitializeTimerSet(&timers);
-        struct pb_Parameters* params;
-        params = pb_ReadParameters(&nArgs, arg);
-        
-
-	static LBM_GridPtr TEMP_srcGrid;
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
-	MAIN_parseCommandLine( nArgs, arg, &param, params );
-	MAIN_printInfo( &param );
-
-	MAIN_initialize( &param );
-
-	for( t = 1; t <= param.nTimeSteps; t++ ) {
-                pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-		CUDA_LBM_performStreamCollide( CUDA_srcGrid, CUDA_dstGrid );
-                pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-		LBM_swapGrids( &CUDA_srcGrid, &CUDA_dstGrid );
-
-		if( (t & 63) == 0 ) {
-			printf( "timestep: %i\n", t );
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
+  int t;
+
+  pb_InitializeTimerSet(&timers);
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
+
+  static LBM_GridPtr TEMP_srcGrid;
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
+
+  MAIN_initialize(&param);
+
+  for (t = 1; t <= param.nTimeSteps; t++) {
+    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+    CUDA_LBM_performStreamCollide(CUDA_srcGrid, CUDA_dstGrid);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    LBM_swapGrids(&CUDA_srcGrid, &CUDA_dstGrid);
+
+    if ((t & 63) == 0) {
+      printf("timestep: %i\n", t);
 #if 0
 			CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
 			LBM_showGridStatistics( *TEMP_srcGrid );
 #endif
-		}
-	}
+    }
+  }
 
-	MAIN_finalize( &param );
+  MAIN_finalize(&param);
 
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        pb_PrintTimerSet(&timers);
-        pb_FreeParameters(params);
-	return 0;
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(params);
+  return 0;
 }
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) {
-	struct stat fileStat;
-
-	if( nArgs < 2 ) {
-		printf( "syntax: lbm <time steps>\n" );
-		exit( 1 );
-	}
-
-	param->nTimeSteps     = atoi( arg[1] );
-
-	if( params->inpFiles[0] != NULL ) {
-		param->obstacleFilename = params->inpFiles[0];
-
-		if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-			printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-					param->obstacleFilename );
-			exit( 1 );
-		}
-		if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-			printf( "MAIN_parseCommandLine:\n"
-					"\tsize of file '%s' is %i bytes\n"
-					"\texpected size is %i bytes\n",
-					param->obstacleFilename, (int) fileStat.st_size,
-					SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-			exit( 1 );
-		}
-	}
-	else param->obstacleFilename = NULL;
-
-        param->resultFilename = params->outFile;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
+
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
+
+  param->nTimeSteps = atoi(arg[1]);
+
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
+    }
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
+    }
+  } else
+    param->obstacleFilename = NULL;
+
+  param->resultFilename = params->outFile;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-	printf( "MAIN_printInfo:\n"
-			"\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-			"\tnTimeSteps     : %i\n"
-			"\tresult file    : %s\n"
-			"\taction         : %s\n"
-			"\tsimulation type: %s\n"
-			"\tobstacle file  : %s\n\n",
-			SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-			param->nTimeSteps, param->resultFilename, 
-			"store", "lid-driven cavity",
-			(param->obstacleFilename == NULL) ? "<none>" :
-			param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
-void MAIN_initialize( const MAIN_Param* param ) {
-	static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+void MAIN_initialize(const MAIN_Param *param) {
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
-	LBM_allocateGrid( (float**) &TEMP_dstGrid );
-	LBM_initializeGrid( TEMP_srcGrid );
-	LBM_initializeGrid( TEMP_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	if( param->obstacleFilename != NULL ) {
-		LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename );
-		LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename );
-	}
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
+  }
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-	LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	//Setup DEVICE datastructures
-	CUDA_LBM_allocateGrid( (float**) &CUDA_srcGrid );
-	CUDA_LBM_allocateGrid( (float**) &CUDA_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // Setup DEVICE datastructures
+  CUDA_LBM_allocateGrid((float **)&CUDA_srcGrid);
+  CUDA_LBM_allocateGrid((float **)&CUDA_dstGrid);
 
-	//Initialize DEVICE datastructures
-	CUDA_LBM_initializeGrid( (float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid );
-	CUDA_LBM_initializeGrid( (float**)&CUDA_dstGrid, (float**)&TEMP_dstGrid );
+  // Initialize DEVICE datastructures
+  CUDA_LBM_initializeGrid((float **)&CUDA_srcGrid, (float **)&TEMP_srcGrid);
+  CUDA_LBM_initializeGrid((float **)&CUDA_dstGrid, (float **)&TEMP_dstGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
-	LBM_freeGrid( (float**) &TEMP_dstGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param ) {
-	LBM_Grid TEMP_srcGrid;
+void MAIN_finalize(const MAIN_Param *param) {
+  LBM_Grid TEMP_srcGrid;
 
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  CUDA_LBM_getDeviceGrid((float **)&CUDA_srcGrid, (float **)&TEMP_srcGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-	LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE );
+  LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
 
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
-	CUDA_LBM_freeGrid( (float**) &CUDA_srcGrid );
-	CUDA_LBM_freeGrid( (float**) &CUDA_dstGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  CUDA_LBM_freeGrid((float **)&CUDA_srcGrid);
+  CUDA_LBM_freeGrid((float **)&CUDA_dstGrid);
 }
-
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.h
index 1bb326e9a9b7df66cd6dda5c411c60e7db962da3..2094b326226a5a97ee15ac32dccdb8dde1e41583 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda/main.h
@@ -16,27 +16,30 @@
 /*############################################################################*/
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param );
-void MAIN_finalize( const MAIN_Param* param );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param);
+void MAIN_finalize(const MAIN_Param *param);
 
 /*############################################################################*/
 
 #ifndef __MCUDA__
-#define CUDA_ERRCK                                                      \
-  {cudaError_t err;                                                     \
-    if ((err = cudaGetLastError()) != cudaSuccess) {                    \
-      fprintf(stderr, "CUDA error on line %d: %s\n", __LINE__, cudaGetErrorString(err)); \
-      exit(-1);                                                         \
-    }                                                                   \
+#define CUDA_ERRCK                                                             \
+  {                                                                            \
+    cudaError_t err;                                                           \
+    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
+      fprintf(stderr, "CUDA error on line %d: %s\n", __LINE__,                 \
+              cudaGetErrorString(err));                                        \
+      exit(-1);                                                                \
+    }                                                                          \
   }
 #else
 #define CUDA_ERRCK
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/layout_config.h
index 0b7efa770cddad6d71af56c7bf695391a21ad48a..497829547c0dfb739f5c3bb3b22c0c871935054f 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/layout_config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/layout_config.h
@@ -13,34 +13,34 @@
 
 /*############################################################################*/
 
-//Unchangeable settings: volume simulation size for the given example
+// Unchangeable settings: volume simulation size for the given example
 #define SIZE_X (120)
 #define SIZE_Y (120)
 #define SIZE_Z (150)
 
-//Changeable settings
-//Padding in each dimension
-//Note that the padding in the highest Cartesian dimension 
-// must be at least 4 to simplify the kernel by avoiding 
+// Changeable settings
+// Padding in each dimension
+// Note that the padding in the highest Cartesian dimension
+// must be at least 4 to simplify the kernel by avoiding
 // out-of-bounds access checks.
 #define PADDING_X (8)
 #define PADDING_Y (0)
 #define PADDING_Z (4)
 
-//Pitch in each dimension
-#define PADDED_X (SIZE_X+PADDING_X)
-#define PADDED_Y (SIZE_Y+PADDING_Y)
-#define PADDED_Z (SIZE_Z+PADDING_Z)
+// Pitch in each dimension
+#define PADDED_X (SIZE_X + PADDING_X)
+#define PADDED_Y (SIZE_Y + PADDING_Y)
+#define PADDED_Z (SIZE_Z + PADDING_Z)
 
-#define TOTAL_CELLS (SIZE_X*SIZE_Y*(SIZE_Z))
-#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)
+#define TOTAL_CELLS (SIZE_X * SIZE_Y * (SIZE_Z))
+#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z)
 
-//Flattening function
+// Flattening function
 //  This macro will be used to map a 3-D index and element to a value
-//  The macro below implements the equivalent of a 3-D array of 
+//  The macro below implements the equivalent of a 3-D array of
 //  20-element structures in C standard layout.
-#define CALC_INDEX(x,y,z,e) ( e + N_CELL_ENTRIES*\
-                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
+#define CALC_INDEX(x, y, z, e)                                                 \
+  (e + N_CELL_ENTRIES * ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y))
 
 // Set this value to 1 for GATHER, or 0 for SCATTER
 #if 0
@@ -49,7 +49,7 @@
 #define SCATTER
 #endif
 
-//CUDA block size (not trivially changeable here)
+// CUDA block size (not trivially changeable here)
 #define BLOCK_SIZE SIZE_X
 
 /*############################################################################*/
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm.h
index 6f3b1138805911d1053c3c78ebcf9496bec7604f..10c32ddd3761120bfb1073fe3c9ee65a9ef8cff6 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm.h
@@ -13,21 +13,38 @@
 
 /*############################################################################*/
 
-
 /*############################################################################*/
 
-typedef enum {C = 0,
-              N, S, E, W, T, B,
-              NE, NW, SE, SW,
-              NT, NB, ST, SB,
-              ET, EB, WT, WB,
-              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+typedef enum {
+  C = 0,
+  N,
+  S,
+  E,
+  W,
+  T,
+  B,
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS,
+  N_CELL_ENTRIES
+} CELL_ENTRIES;
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
-
+typedef enum {
+  OBSTACLE = 1 << 0,
+  ACCEL = 1 << 1,
+  IN_OUT_FLOW = 1 << 2
+} CELL_FLAGS;
 
 #include "layout_config.h"
 #include "lbm_macros.h"
@@ -36,23 +53,23 @@ typedef enum {OBSTACLE    = 1 << 0,
 #ifdef __cplusplus
 extern "C" {
 #endif
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_swapGrids( LBM_GridPtr grid1, LBM_GridPtr grid2 );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_swapGrids(LBM_GridPtr grid1, LBM_GridPtr grid2);
+void LBM_showGridStatistics(LBM_Grid Grid);
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary);
 
 /* CUDA ***********************************************************************/
 
-void CUDA_LBM_allocateGrid( float** ptr );
-void CUDA_LBM_freeGrid( float** ptr );
-void CUDA_LBM_initializeGrid( float** d_grid, float** h_grid );
-void CUDA_LBM_getDeviceGrid( float** d_grid, float** h_grid );
-void CUDA_LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid );
+void CUDA_LBM_allocateGrid(float **ptr);
+void CUDA_LBM_freeGrid(float **ptr);
+void CUDA_LBM_initializeGrid(float **d_grid, float **h_grid);
+void CUDA_LBM_getDeviceGrid(float **d_grid, float **h_grid);
+void CUDA_LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid);
 #ifdef __cplusplus
 }
 #endif
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm_macros.h
index 9c3934c01f32db7b7b8e2075dda275f0c140f75a..7f5e522da17823325e1f74bd405faa4b560c1a31 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm_macros.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/lbm_macros.h
@@ -19,154 +19,175 @@
 
 /*############################################################################*/
 
-typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+typedef float
+    *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
+typedef LBM_Grid *LBM_GridPtr;
 
 /*############################################################################*/
 
-
-#define SWEEP_X  __temp_x__
-#define SWEEP_Y  __temp_y__
-#define SWEEP_Z  __temp_z__
+#define SWEEP_X __temp_x__
+#define SWEEP_Y __temp_y__
+#define SWEEP_Z __temp_z__
 #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-	for( __temp_z__ = z1; \
-	     __temp_z__ < z2; \
-		__temp_z__++) { \
-            for( __temp_y__ = 0; \
-                 __temp_y__ < SIZE_Y; \
-                 __temp_y__++) { \
-		for(__temp_x__ = 0; \
-	            __temp_x__ < SIZE_X; \
-                    __temp_x__++) { \
-
-#define SWEEP_END }}}
-
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) {                       \
+    for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) {                  \
+      for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) {
+
+#define SWEEP_END                                                              \
+  }                                                                            \
+  }                                                                            \
+  }
+
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)])
+
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #ifdef SCATTER
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C))
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+
+#define DST_C(g) (NEIGHBOR_C(g, C))
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* SCATTER */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C))
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* SCATTER */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
+#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v)
+
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.cc b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.cc
index bc238e27340f6fdd16f96be059e5fc831d598609..6ed10cad68cd7c02b427ad58dbe784d5fd15b6aa 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.cc
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.cc
@@ -18,159 +18,156 @@
 /*############################################################################*/
 static LBM_Grid CUDA_srcGrid, CUDA_dstGrid;
 
-
 /*############################################################################*/
 
 struct pb_TimerSet timers;
-int main( int nArgs, char* arg[] ) {
-	MAIN_Param param;
-	int t;
-
-	pb_InitializeTimerSet(&timers);
-        struct pb_Parameters* params;
-        params = pb_ReadParameters(&nArgs, arg);
-        
-
-	static LBM_GridPtr TEMP_srcGrid;
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
-	MAIN_parseCommandLine( nArgs, arg, &param, params );
-	MAIN_printInfo( &param );
-
-	MAIN_initialize( &param );
-
-	for( t = 1; t <= param.nTimeSteps; t++ ) {
-                pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-		CUDA_LBM_performStreamCollide( CUDA_srcGrid, CUDA_dstGrid );
-                pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-		LBM_swapGrids( &CUDA_srcGrid, &CUDA_dstGrid );
-
-		if( (t & 63) == 0 ) {
-			printf( "timestep: %i\n", t );
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
+  int t;
+
+  pb_InitializeTimerSet(&timers);
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
+
+  static LBM_GridPtr TEMP_srcGrid;
+  // Set up TEMP data structures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
+
+  MAIN_initialize(&param);
+
+  for (t = 1; t <= param.nTimeSteps; t++) {
+    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+    CUDA_LBM_performStreamCollide(CUDA_srcGrid, CUDA_dstGrid);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    LBM_swapGrids(&CUDA_srcGrid, &CUDA_dstGrid);
+
+    if ((t & 63) == 0) {
+      printf("timestep: %i\n", t);
 #if 0
 			CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
 			LBM_showGridStatistics( *TEMP_srcGrid );
 #endif
-		}
-	}
+    }
+  }
 
-	MAIN_finalize( &param );
+  MAIN_finalize(&param);
 
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        pb_PrintTimerSet(&timers);
-        pb_FreeParameters(params);
-	return 0;
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(params);
+  return 0;
 }
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) {
-	struct stat fileStat;
-
-	if( nArgs < 2 ) {
-		printf( "syntax: lbm <time steps>\n" );
-		exit( 1 );
-	}
-
-	param->nTimeSteps     = atoi( arg[1] );
-
-	if( params->inpFiles[0] != NULL ) {
-		param->obstacleFilename = params->inpFiles[0];
-
-		if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-			printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-					param->obstacleFilename );
-			exit( 1 );
-		}
-		if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-			printf( "MAIN_parseCommandLine:\n"
-					"\tsize of file '%s' is %i bytes\n"
-					"\texpected size is %i bytes\n",
-					param->obstacleFilename, (int) fileStat.st_size,
-					SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-			exit( 1 );
-		}
-	}
-	else param->obstacleFilename = NULL;
-
-        param->resultFilename = params->outFile;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
+
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
+
+  param->nTimeSteps = atoi(arg[1]);
+
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
+    }
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
+    }
+  } else
+    param->obstacleFilename = NULL;
+
+  param->resultFilename = params->outFile;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-	printf( "MAIN_printInfo:\n"
-			"\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-			"\tnTimeSteps     : %i\n"
-			"\tresult file    : %s\n"
-			"\taction         : %s\n"
-			"\tsimulation type: %s\n"
-			"\tobstacle file  : %s\n\n",
-			SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-			param->nTimeSteps, param->resultFilename, 
-			"store", "lid-driven cavity",
-			(param->obstacleFilename == NULL) ? "<none>" :
-			param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
-void MAIN_initialize( const MAIN_Param* param ) {
-	static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+void MAIN_initialize(const MAIN_Param *param) {
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
-	LBM_allocateGrid( (float**) &TEMP_dstGrid );
-	LBM_initializeGrid( TEMP_srcGrid );
-	LBM_initializeGrid( TEMP_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Set up TEMP data structures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	if( param->obstacleFilename != NULL ) {
-		LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename );
-		LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename );
-	}
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
+  }
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-	LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	//Setup DEVICE datastructures
-	CUDA_LBM_allocateGrid( (float**) &CUDA_srcGrid );
-	CUDA_LBM_allocateGrid( (float**) &CUDA_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // Set up DEVICE data structures
+  CUDA_LBM_allocateGrid((float **)&CUDA_srcGrid);
+  CUDA_LBM_allocateGrid((float **)&CUDA_dstGrid);
 
-	//Initialize DEVICE datastructures
-	CUDA_LBM_initializeGrid( (float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid );
-	CUDA_LBM_initializeGrid( (float**)&CUDA_dstGrid, (float**)&TEMP_dstGrid );
+  // Initialize DEVICE data structures
+  CUDA_LBM_initializeGrid((float **)&CUDA_srcGrid, (float **)&TEMP_srcGrid);
+  CUDA_LBM_initializeGrid((float **)&CUDA_dstGrid, (float **)&TEMP_dstGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
-	LBM_freeGrid( (float**) &TEMP_dstGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param ) {
-	LBM_Grid TEMP_srcGrid;
+void MAIN_finalize(const MAIN_Param *param) {
+  LBM_Grid TEMP_srcGrid;
 
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
+  // Set up TEMP data structures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  CUDA_LBM_getDeviceGrid((float **)&CUDA_srcGrid, (float **)&TEMP_srcGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-	LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE );
+  LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
 
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
-	CUDA_LBM_freeGrid( (float**) &CUDA_srcGrid );
-	CUDA_LBM_freeGrid( (float**) &CUDA_dstGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  CUDA_LBM_freeGrid((float **)&CUDA_srcGrid);
+  CUDA_LBM_freeGrid((float **)&CUDA_dstGrid);
 }
-
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.h b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.h
index 1bb326e9a9b7df66cd6dda5c411c60e7db962da3..2094b326226a5a97ee15ac32dccdb8dde1e41583 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/cuda_base/main.h
@@ -16,27 +16,30 @@
 /*############################################################################*/
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param );
-void MAIN_finalize( const MAIN_Param* param );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param);
+void MAIN_finalize(const MAIN_Param *param);
 
 /*############################################################################*/
 
 #ifndef __MCUDA__
-#define CUDA_ERRCK                                                      \
-  {cudaError_t err;                                                     \
-    if ((err = cudaGetLastError()) != cudaSuccess) {                    \
-      fprintf(stderr, "CUDA error on line %d: %s\n", __LINE__, cudaGetErrorString(err)); \
-      exit(-1);                                                         \
-    }                                                                   \
+#define CUDA_ERRCK                                                             \
+  {                                                                            \
+    cudaError_t err;                                                           \
+    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
+      fprintf(stderr, "CUDA error on line %d: %s\n", __LINE__,                 \
+              cudaGetErrorString(err));                                        \
+      exit(-1);                                                                \
+    }                                                                          \
   }
 #else
 #define CUDA_ERRCK
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/config.h b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/config.h
index ce9ce82c4acc351d7d239f3053023e964490eabe..0cd4bd055875c814b1712939b73179f7607043ad 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/config.h
@@ -14,7 +14,7 @@
 
 #define OMEGA (1.95f)
 
-#define OUTPUT_PRECISION float 
+#define OUTPUT_PRECISION float
 
 #define BOOL int
 #define TRUE (-1)
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.c
index 81294ac4455b4a92dfe80b7cb5d0ac0696a4b027..e6ea7c4d621e8470680a125bca11f70a634f2a56 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.c
@@ -4,9 +4,8 @@
 
 #include "lbm.h"
 #include <math.h>
-#include <stdlib.h>
 #include <stdio.h>
-
+#include <stdlib.h>
 
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
@@ -16,674 +15,757 @@
 
 /*############################################################################*/
 
-#define DFL1 (1.0/ 3.0)
-#define DFL2 (1.0/18.0)
-#define DFL3 (1.0/36.0)
+#define DFL1 (1.0 / 3.0)
+#define DFL2 (1.0 / 18.0)
+#define DFL3 (1.0 / 36.0)
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr ) {
-	const size_t margin = 2*SIZE_X*SIZE_Y*N_CELL_ENTRIES,
-	             size   = sizeof( LBM_Grid ) + 2*margin*sizeof( float );
+void LBM_allocateGrid(float **ptr) {
+  const size_t margin = 2 * SIZE_X * SIZE_Y * N_CELL_ENTRIES,
+               size = sizeof(LBM_Grid) + 2 * margin * sizeof(float);
 
-	*ptr = malloc( size );
-	if( ! *ptr ) {
-		printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
-		        size / (1024.0*1024.0) );
-		exit( 1 );
-	}
+  *ptr = malloc(size);
+  if (!*ptr) {
+    printf("LBM_allocateGrid: could not allocate %.1f MByte\n",
+           size / (1024.0 * 1024.0));
+    exit(1);
+  }
 #if !defined(SPEC_CPU)
-	printf( "LBM_allocateGrid: allocated %.1f MByte\n",
-	        size / (1024.0*1024.0) );
+  printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0));
 #endif
-	*ptr += margin;
+  *ptr += margin;
 }
 
 /*############################################################################*/
 
-void LBM_freeGrid( float** ptr ) {
-	const size_t margin = 2*SIZE_X*SIZE_Y*N_CELL_ENTRIES;
+void LBM_freeGrid(float **ptr) {
+  const size_t margin = 2 * SIZE_X * SIZE_Y * N_CELL_ENTRIES;
 
-	free( *ptr-margin );
-	*ptr = NULL;
+  free(*ptr - margin);
+  *ptr = NULL;
 }
 
 /*############################################################################*/
 
-void LBM_initializeGrid( LBM_Grid grid ) {
-	SWEEP_VAR
+void LBM_initializeGrid(LBM_Grid grid) {
+  SWEEP_VAR
 
-	/*voption indep*/
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
 #pragma omp parallel for
 #endif
 #endif
-	SWEEP_START( 0, 0, -2, 0, 0, SIZE_Z+2 )
-		LOCAL( grid, C  ) = DFL1;
-		LOCAL( grid, N  ) = DFL2;
-		LOCAL( grid, S  ) = DFL2;
-		LOCAL( grid, E  ) = DFL2;
-		LOCAL( grid, W  ) = DFL2;
-		LOCAL( grid, T  ) = DFL2;
-		LOCAL( grid, B  ) = DFL2;
-		LOCAL( grid, NE ) = DFL3;
-		LOCAL( grid, NW ) = DFL3;
-		LOCAL( grid, SE ) = DFL3;
-		LOCAL( grid, SW ) = DFL3;
-		LOCAL( grid, NT ) = DFL3;
-		LOCAL( grid, NB ) = DFL3;
-		LOCAL( grid, ST ) = DFL3;
-		LOCAL( grid, SB ) = DFL3;
-		LOCAL( grid, ET ) = DFL3;
-		LOCAL( grid, EB ) = DFL3;
-		LOCAL( grid, WT ) = DFL3;
-		LOCAL( grid, WB ) = DFL3;
-
-		CLEAR_ALL_FLAGS_SWEEP( grid );
-	SWEEP_END
+  SWEEP_START(0, 0, -2, 0, 0, SIZE_Z + 2)
+  LOCAL(grid, C) = DFL1;
+  LOCAL(grid, N) = DFL2;
+  LOCAL(grid, S) = DFL2;
+  LOCAL(grid, E) = DFL2;
+  LOCAL(grid, W) = DFL2;
+  LOCAL(grid, T) = DFL2;
+  LOCAL(grid, B) = DFL2;
+  LOCAL(grid, NE) = DFL3;
+  LOCAL(grid, NW) = DFL3;
+  LOCAL(grid, SE) = DFL3;
+  LOCAL(grid, SW) = DFL3;
+  LOCAL(grid, NT) = DFL3;
+  LOCAL(grid, NB) = DFL3;
+  LOCAL(grid, ST) = DFL3;
+  LOCAL(grid, SB) = DFL3;
+  LOCAL(grid, ET) = DFL3;
+  LOCAL(grid, EB) = DFL3;
+  LOCAL(grid, WT) = DFL3;
+  LOCAL(grid, WB) = DFL3;
+
+  CLEAR_ALL_FLAGS_SWEEP(grid);
+  SWEEP_END
 }
 
 /*############################################################################*/
 
-void LBM_swapGrids( LBM_GridPtr* grid1, LBM_GridPtr* grid2 ) {
-	LBM_GridPtr aux = *grid1;
-	*grid1 = *grid2;
-	*grid2 = aux;
+void LBM_swapGrids(LBM_GridPtr *grid1, LBM_GridPtr *grid2) {
+  LBM_GridPtr aux = *grid1;
+  *grid1 = *grid2;
+  *grid2 = aux;
 }
 
 /*############################################################################*/
 
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
-	int x,  y,  z;
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) {
+  int x, y, z;
 
-	FILE* file = fopen( filename, "rb" );
+  FILE *file = fopen(filename, "rb");
 
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE );
-			}
-			fgetc( file );
-		}
-		fgetc( file );
-	}
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (fgetc(file) != '.')
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+      }
+      fgetc(file);
+    }
+    fgetc(file);
+  }
 
-	fclose( file );
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
-	int x,  y,  z;
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) {
+  int x, y, z;
 
-	/*voption indep*/
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
-#pragma omp parallel for private( x, y )
+#pragma omp parallel for private(x, y)
 #endif
 #endif
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-				    y == 0 || y == SIZE_Y-1 ||
-				    z == 0 || z == SIZE_Z-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-				}
-				else {
-					if( (z == 1 || z == SIZE_Z-2) &&
-					     x > 1 && x < SIZE_X-2 &&
-					     y > 1 && y < SIZE_Y-2 ) {
-						SET_FLAG( grid, x, y, z, ACCEL );
-					}
-				}
-			}
-		}
-	}
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 ||
+            z == SIZE_Z - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+        } else {
+          if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 &&
+              y < SIZE_Y - 2) {
+            SET_FLAG(grid, x, y, z, ACCEL);
+          }
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForChannel( LBM_Grid grid ) {
-	int x,  y,  z;
+void LBM_initializeSpecialCellsForChannel(LBM_Grid grid) {
+  int x, y, z;
 
-	/*voption indep*/
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
-#pragma omp parallel for private( x, y )
+#pragma omp parallel for private(x, y)
 #endif
 #endif
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-				    y == 0 || y == SIZE_Y-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-
-					if( (z == 0 || z == SIZE_Z-1) &&
-					    ! TEST_FLAG( grid, x, y, z, OBSTACLE ))
-						SET_FLAG( grid, x, y, z, IN_OUT_FLOW );
-				}
-			}
-		}
-	}
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+
+          if ((z == 0 || z == SIZE_Z - 1) &&
+              !TEST_FLAG(grid, x, y, z, OBSTACLE))
+            SET_FLAG(grid, x, y, z, IN_OUT_FLOW);
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid ) {
-	SWEEP_VAR
+void LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid) {
+  SWEEP_VAR
 
-	float ux, uy, uz, u2, rho;
+  float ux, uy, uz, u2, rho;
 
-	/*voption indep*/
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
-#pragma omp parallel for private( ux, uy, uz, u2, rho )
+#pragma omp parallel for private(ux, uy, uz, u2, rho)
 #endif
 #endif
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		if( TEST_FLAG_SWEEP( srcGrid, OBSTACLE )) {
-			DST_C ( dstGrid ) = SRC_C ( srcGrid );
-			DST_S ( dstGrid ) = SRC_N ( srcGrid );
-			DST_N ( dstGrid ) = SRC_S ( srcGrid );
-			DST_W ( dstGrid ) = SRC_E ( srcGrid );
-			DST_E ( dstGrid ) = SRC_W ( srcGrid );
-			DST_B ( dstGrid ) = SRC_T ( srcGrid );
-			DST_T ( dstGrid ) = SRC_B ( srcGrid );
-			DST_SW( dstGrid ) = SRC_NE( srcGrid );
-			DST_SE( dstGrid ) = SRC_NW( srcGrid );
-			DST_NW( dstGrid ) = SRC_SE( srcGrid );
-			DST_NE( dstGrid ) = SRC_SW( srcGrid );
-			DST_SB( dstGrid ) = SRC_NT( srcGrid );
-			DST_ST( dstGrid ) = SRC_NB( srcGrid );
-			DST_NB( dstGrid ) = SRC_ST( srcGrid );
-			DST_NT( dstGrid ) = SRC_SB( srcGrid );
-			DST_WB( dstGrid ) = SRC_ET( srcGrid );
-			DST_WT( dstGrid ) = SRC_EB( srcGrid );
-			DST_EB( dstGrid ) = SRC_WT( srcGrid );
-			DST_ET( dstGrid ) = SRC_WB( srcGrid );
-			continue;
-		}
-
-		rho = + SRC_C ( srcGrid ) + SRC_N ( srcGrid )
-		      + SRC_S ( srcGrid ) + SRC_E ( srcGrid )
-		      + SRC_W ( srcGrid ) + SRC_T ( srcGrid )
-		      + SRC_B ( srcGrid ) + SRC_NE( srcGrid )
-		      + SRC_NW( srcGrid ) + SRC_SE( srcGrid )
-		      + SRC_SW( srcGrid ) + SRC_NT( srcGrid )
-		      + SRC_NB( srcGrid ) + SRC_ST( srcGrid )
-		      + SRC_SB( srcGrid ) + SRC_ET( srcGrid )
-		      + SRC_EB( srcGrid ) + SRC_WT( srcGrid )
-		      + SRC_WB( srcGrid );
-
-		ux = + SRC_E ( srcGrid ) - SRC_W ( srcGrid )
-		     + SRC_NE( srcGrid ) - SRC_NW( srcGrid )
-		     + SRC_SE( srcGrid ) - SRC_SW( srcGrid )
-		     + SRC_ET( srcGrid ) + SRC_EB( srcGrid )
-		     - SRC_WT( srcGrid ) - SRC_WB( srcGrid );
-		uy = + SRC_N ( srcGrid ) - SRC_S ( srcGrid )
-		     + SRC_NE( srcGrid ) + SRC_NW( srcGrid )
-		     - SRC_SE( srcGrid ) - SRC_SW( srcGrid )
-		     + SRC_NT( srcGrid ) + SRC_NB( srcGrid )
-		     - SRC_ST( srcGrid ) - SRC_SB( srcGrid );
-		uz = + SRC_T ( srcGrid ) - SRC_B ( srcGrid )
-		     + SRC_NT( srcGrid ) - SRC_NB( srcGrid )
-		     + SRC_ST( srcGrid ) - SRC_SB( srcGrid )
-		     + SRC_ET( srcGrid ) - SRC_EB( srcGrid )
-		     + SRC_WT( srcGrid ) - SRC_WB( srcGrid );
-
-		ux /= rho;
-		uy /= rho;
-		uz /= rho;
-
-		if( TEST_FLAG_SWEEP( srcGrid, ACCEL )) {
-			ux = 0.005f;
-			uy = 0.002f;
-			uz = 0.000f;
-		}
-
-		u2 = 1.5f * (ux*ux + uy*uy + uz*uz);
-		DST_C ( dstGrid ) = (1.0f-OMEGA)*SRC_C ( srcGrid ) + DFL1*OMEGA*rho*(1.0f                                 - u2);
-
-		DST_N ( dstGrid ) = (1.0f-OMEGA)*SRC_N ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       uy*(4.5f*uy       + 3.0f) - u2);
-		DST_S ( dstGrid ) = (1.0f-OMEGA)*SRC_S ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       uy*(4.5f*uy       - 3.0f) - u2);
-		DST_E ( dstGrid ) = (1.0f-OMEGA)*SRC_E ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       ux*(4.5f*ux       + 3.0f) - u2);
-		DST_W ( dstGrid ) = (1.0f-OMEGA)*SRC_W ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       ux*(4.5f*ux       - 3.0f) - u2);
-		DST_T ( dstGrid ) = (1.0f-OMEGA)*SRC_T ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       uz*(4.5f*uz       + 3.0f) - u2);
-		DST_B ( dstGrid ) = (1.0f-OMEGA)*SRC_B ( srcGrid ) + DFL2*OMEGA*rho*(1.0f +       uz*(4.5f*uz       - 3.0f) - u2);
-
-		DST_NE( dstGrid ) = (1.0f-OMEGA)*SRC_NE( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2);
-		DST_NW( dstGrid ) = (1.0f-OMEGA)*SRC_NW( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2);
-		DST_SE( dstGrid ) = (1.0f-OMEGA)*SRC_SE( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2);
-		DST_SW( dstGrid ) = (1.0f-OMEGA)*SRC_SW( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2);
-		DST_NT( dstGrid ) = (1.0f-OMEGA)*SRC_NT( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2);
-		DST_NB( dstGrid ) = (1.0f-OMEGA)*SRC_NB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2);
-		DST_ST( dstGrid ) = (1.0f-OMEGA)*SRC_ST( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2);
-		DST_SB( dstGrid ) = (1.0f-OMEGA)*SRC_SB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2);
-		DST_ET( dstGrid ) = (1.0f-OMEGA)*SRC_ET( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2);
-		DST_EB( dstGrid ) = (1.0f-OMEGA)*SRC_EB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2);
-		DST_WT( dstGrid ) = (1.0f-OMEGA)*SRC_WT( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2);
-		DST_WB( dstGrid ) = (1.0f-OMEGA)*SRC_WB( srcGrid ) + DFL3*OMEGA*rho*(1.0f + (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2);
-	SWEEP_END
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  if (TEST_FLAG_SWEEP(srcGrid, OBSTACLE)) {
+    DST_C(dstGrid) = SRC_C(srcGrid);
+    DST_S(dstGrid) = SRC_N(srcGrid);
+    DST_N(dstGrid) = SRC_S(srcGrid);
+    DST_W(dstGrid) = SRC_E(srcGrid);
+    DST_E(dstGrid) = SRC_W(srcGrid);
+    DST_B(dstGrid) = SRC_T(srcGrid);
+    DST_T(dstGrid) = SRC_B(srcGrid);
+    DST_SW(dstGrid) = SRC_NE(srcGrid);
+    DST_SE(dstGrid) = SRC_NW(srcGrid);
+    DST_NW(dstGrid) = SRC_SE(srcGrid);
+    DST_NE(dstGrid) = SRC_SW(srcGrid);
+    DST_SB(dstGrid) = SRC_NT(srcGrid);
+    DST_ST(dstGrid) = SRC_NB(srcGrid);
+    DST_NB(dstGrid) = SRC_ST(srcGrid);
+    DST_NT(dstGrid) = SRC_SB(srcGrid);
+    DST_WB(dstGrid) = SRC_ET(srcGrid);
+    DST_WT(dstGrid) = SRC_EB(srcGrid);
+    DST_EB(dstGrid) = SRC_WT(srcGrid);
+    DST_ET(dstGrid) = SRC_WB(srcGrid);
+    continue;
+  }
+
+  rho = +SRC_C(srcGrid) + SRC_N(srcGrid) + SRC_S(srcGrid) + SRC_E(srcGrid) +
+        SRC_W(srcGrid) + SRC_T(srcGrid) + SRC_B(srcGrid) + SRC_NE(srcGrid) +
+        SRC_NW(srcGrid) + SRC_SE(srcGrid) + SRC_SW(srcGrid) + SRC_NT(srcGrid) +
+        SRC_NB(srcGrid) + SRC_ST(srcGrid) + SRC_SB(srcGrid) + SRC_ET(srcGrid) +
+        SRC_EB(srcGrid) + SRC_WT(srcGrid) + SRC_WB(srcGrid);
+
+  ux = +SRC_E(srcGrid) - SRC_W(srcGrid) + SRC_NE(srcGrid) - SRC_NW(srcGrid) +
+       SRC_SE(srcGrid) - SRC_SW(srcGrid) + SRC_ET(srcGrid) + SRC_EB(srcGrid) -
+       SRC_WT(srcGrid) - SRC_WB(srcGrid);
+  uy = +SRC_N(srcGrid) - SRC_S(srcGrid) + SRC_NE(srcGrid) + SRC_NW(srcGrid) -
+       SRC_SE(srcGrid) - SRC_SW(srcGrid) + SRC_NT(srcGrid) + SRC_NB(srcGrid) -
+       SRC_ST(srcGrid) - SRC_SB(srcGrid);
+  uz = +SRC_T(srcGrid) - SRC_B(srcGrid) + SRC_NT(srcGrid) - SRC_NB(srcGrid) +
+       SRC_ST(srcGrid) - SRC_SB(srcGrid) + SRC_ET(srcGrid) - SRC_EB(srcGrid) +
+       SRC_WT(srcGrid) - SRC_WB(srcGrid);
+
+  ux /= rho;
+  uy /= rho;
+  uz /= rho;
+
+  if (TEST_FLAG_SWEEP(srcGrid, ACCEL)) {
+    ux = 0.005f;
+    uy = 0.002f;
+    uz = 0.000f;
+  }
+
+  u2 = 1.5f * (ux * ux + uy * uy + uz * uz);
+  DST_C(dstGrid) =
+      (1.0f - OMEGA) * SRC_C(srcGrid) + DFL1 * OMEGA * rho * (1.0f - u2);
+
+  DST_N(dstGrid) = (1.0f - OMEGA) * SRC_N(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + uy * (4.5f * uy + 3.0f) - u2);
+  DST_S(dstGrid) = (1.0f - OMEGA) * SRC_S(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + uy * (4.5f * uy - 3.0f) - u2);
+  DST_E(dstGrid) = (1.0f - OMEGA) * SRC_E(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + ux * (4.5f * ux + 3.0f) - u2);
+  DST_W(dstGrid) = (1.0f - OMEGA) * SRC_W(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + ux * (4.5f * ux - 3.0f) - u2);
+  DST_T(dstGrid) = (1.0f - OMEGA) * SRC_T(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + uz * (4.5f * uz + 3.0f) - u2);
+  DST_B(dstGrid) = (1.0f - OMEGA) * SRC_B(srcGrid) +
+                   DFL2 * OMEGA * rho * (1.0f + uz * (4.5f * uz - 3.0f) - u2);
+
+  DST_NE(dstGrid) = (1.0f - OMEGA) * SRC_NE(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+ux + uy) * (4.5f * (+ux + uy) + 3.0f) - u2);
+  DST_NW(dstGrid) = (1.0f - OMEGA) * SRC_NW(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-ux + uy) * (4.5f * (-ux + uy) + 3.0f) - u2);
+  DST_SE(dstGrid) = (1.0f - OMEGA) * SRC_SE(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+ux - uy) * (4.5f * (+ux - uy) + 3.0f) - u2);
+  DST_SW(dstGrid) = (1.0f - OMEGA) * SRC_SW(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-ux - uy) * (4.5f * (-ux - uy) + 3.0f) - u2);
+  DST_NT(dstGrid) = (1.0f - OMEGA) * SRC_NT(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+uy + uz) * (4.5f * (+uy + uz) + 3.0f) - u2);
+  DST_NB(dstGrid) = (1.0f - OMEGA) * SRC_NB(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+uy - uz) * (4.5f * (+uy - uz) + 3.0f) - u2);
+  DST_ST(dstGrid) = (1.0f - OMEGA) * SRC_ST(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-uy + uz) * (4.5f * (-uy + uz) + 3.0f) - u2);
+  DST_SB(dstGrid) = (1.0f - OMEGA) * SRC_SB(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-uy - uz) * (4.5f * (-uy - uz) + 3.0f) - u2);
+  DST_ET(dstGrid) = (1.0f - OMEGA) * SRC_ET(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+ux + uz) * (4.5f * (+ux + uz) + 3.0f) - u2);
+  DST_EB(dstGrid) = (1.0f - OMEGA) * SRC_EB(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (+ux - uz) * (4.5f * (+ux - uz) + 3.0f) - u2);
+  DST_WT(dstGrid) = (1.0f - OMEGA) * SRC_WT(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-ux + uz) * (4.5f * (-ux + uz) + 3.0f) - u2);
+  DST_WB(dstGrid) = (1.0f - OMEGA) * SRC_WB(srcGrid) +
+                    DFL3 * OMEGA * rho *
+                        (1.0f + (-ux - uz) * (4.5f * (-ux - uz) + 3.0f) - u2);
+  SWEEP_END
 }
 
 /*############################################################################*/
 
-void LBM_handleInOutFlow( LBM_Grid srcGrid ) {
-	float ux , uy , uz , rho ,
-	       ux1, uy1, uz1, rho1,
-	       ux2, uy2, uz2, rho2,
-	       u2, px, py;
-	SWEEP_VAR
+void LBM_handleInOutFlow(LBM_Grid srcGrid) { /* overwrites srcGrid cells in two sweeps: inflow, then outflow */
+  float ux, uy, uz, rho, ux1, uy1, uz1, rho1, ux2, uy2, uz2, rho2, u2, px, py; /* all listed as private in the OpenMP pragmas below */
+  SWEEP_VAR /* project macro; presumably declares the index used by the sweeps -- confirm */
 
-	/* inflow */
-	/*voption indep*/
+  /* inflow */
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
-#pragma omp parallel for private( ux, uy, uz, rho, ux1, uy1, uz1, rho1, \
-                                  ux2, uy2, uz2, rho2, u2, px, py )
+#pragma omp parallel for private(ux, uy, uz, rho, ux1, uy1, uz1, rho1, ux2,    \
+                                 uy2, uz2, rho2, u2, px, py)
 #endif
 #endif
-	SWEEP_START( 0, 0, 0, 0, 0, 1 )
-		rho1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, C  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, N  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, S  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, E  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, W  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, T  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, B  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, ST )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, ET )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, WT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 1, WB );
-		rho2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, C  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, N  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, S  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, E  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, W  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, T  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, B  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, ST )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, ET )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, WT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, 2, WB );
-
-		rho = 2.0*rho1 - rho2;
-
-		px = (SWEEP_X / (0.5*(SIZE_X-1))) - 1.0;
-		py = (SWEEP_Y / (0.5*(SIZE_Y-1))) - 1.0;
-		ux = 0.00;
-		uy = 0.00;
-		uz = 0.01 * (1.0-px*px) * (1.0-py*py);
-
-		u2 = 1.5 * (ux*ux + uy*uy + uz*uz);
-
-		LOCAL( srcGrid, C ) = DFL1*rho*(1.0                                 - u2);
-
-		LOCAL( srcGrid, N ) = DFL2*rho*(1.0 +       uy*(4.5*uy       + 3.0) - u2);
-		LOCAL( srcGrid, S ) = DFL2*rho*(1.0 +       uy*(4.5*uy       - 3.0) - u2);
-		LOCAL( srcGrid, E ) = DFL2*rho*(1.0 +       ux*(4.5*ux       + 3.0) - u2);
-		LOCAL( srcGrid, W ) = DFL2*rho*(1.0 +       ux*(4.5*ux       - 3.0) - u2);
-		LOCAL( srcGrid, T ) = DFL2*rho*(1.0 +       uz*(4.5*uz       + 3.0) - u2);
-		LOCAL( srcGrid, B ) = DFL2*rho*(1.0 +       uz*(4.5*uz       - 3.0) - u2);
-
-		LOCAL( srcGrid, NE) = DFL3*rho*(1.0 + (+ux+uy)*(4.5*(+ux+uy) + 3.0) - u2);
-		LOCAL( srcGrid, NW) = DFL3*rho*(1.0 + (-ux+uy)*(4.5*(-ux+uy) + 3.0) - u2);
-		LOCAL( srcGrid, SE) = DFL3*rho*(1.0 + (+ux-uy)*(4.5*(+ux-uy) + 3.0) - u2);
-		LOCAL( srcGrid, SW) = DFL3*rho*(1.0 + (-ux-uy)*(4.5*(-ux-uy) + 3.0) - u2);
-		LOCAL( srcGrid, NT) = DFL3*rho*(1.0 + (+uy+uz)*(4.5*(+uy+uz) + 3.0) - u2);
-		LOCAL( srcGrid, NB) = DFL3*rho*(1.0 + (+uy-uz)*(4.5*(+uy-uz) + 3.0) - u2);
-		LOCAL( srcGrid, ST) = DFL3*rho*(1.0 + (-uy+uz)*(4.5*(-uy+uz) + 3.0) - u2);
-		LOCAL( srcGrid, SB) = DFL3*rho*(1.0 + (-uy-uz)*(4.5*(-uy-uz) + 3.0) - u2);
-		LOCAL( srcGrid, ET) = DFL3*rho*(1.0 + (+ux+uz)*(4.5*(+ux+uz) + 3.0) - u2);
-		LOCAL( srcGrid, EB) = DFL3*rho*(1.0 + (+ux-uz)*(4.5*(+ux-uz) + 3.0) - u2);
-		LOCAL( srcGrid, WT) = DFL3*rho*(1.0 + (-ux+uz)*(4.5*(-ux+uz) + 3.0) - u2);
-		LOCAL( srcGrid, WB) = DFL3*rho*(1.0 + (-ux-uz)*(4.5*(-ux-uz) + 3.0) - u2);
-	SWEEP_END
-
-	/* outflow */
-	/*voption indep*/
+  SWEEP_START(0, 0, 0, 0, 0, 1) /* inflow sweep; presumably the z = 0 layer -- macro is defined elsewhere, confirm */
+  rho1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, C) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, N) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, S) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, E) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, W) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, T) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, B) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, NB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, ST) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, SB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, ET) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, EB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, WT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 1, WB); /* rho1: sum of the 19 entries addressed with offset arguments (0, 0, 1) */
+  rho2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, C) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, N) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, S) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, E) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, W) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, T) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, B) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, NB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, ST) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, SB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, ET) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, EB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, WT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, 2, WB); /* rho2: the same sum with offset arguments (0, 0, 2) */
+
+  rho = 2.0 * rho1 - rho2; /* linear extrapolation from rho1 and rho2 */
+
+  px = (SWEEP_X / (0.5 * (SIZE_X - 1))) - 1.0; /* SWEEP_X rescaled: 0 -> -1, SIZE_X - 1 -> +1 */
+  py = (SWEEP_Y / (0.5 * (SIZE_Y - 1))) - 1.0; /* SWEEP_Y rescaled the same way */
+  ux = 0.00;
+  uy = 0.00;
+  uz = 0.01 * (1.0 - px * px) * (1.0 - py * py); /* 0.01 where px = py = 0, zero where |px| or |py| is 1 */
+
+  u2 = 1.5 * (ux * ux + uy * uy + uz * uz); /* 1.5 * |u|^2, shared by every entry below */
+
+  LOCAL(srcGrid, C) = DFL1 * rho * (1.0 - u2); /* all 19 entries are rebuilt from rho and u only; previous values are not read */
+
+  LOCAL(srcGrid, N) = DFL2 * rho * (1.0 + uy * (4.5 * uy + 3.0) - u2);
+  LOCAL(srcGrid, S) = DFL2 * rho * (1.0 + uy * (4.5 * uy - 3.0) - u2);
+  LOCAL(srcGrid, E) = DFL2 * rho * (1.0 + ux * (4.5 * ux + 3.0) - u2);
+  LOCAL(srcGrid, W) = DFL2 * rho * (1.0 + ux * (4.5 * ux - 3.0) - u2);
+  LOCAL(srcGrid, T) = DFL2 * rho * (1.0 + uz * (4.5 * uz + 3.0) - u2);
+  LOCAL(srcGrid, B) = DFL2 * rho * (1.0 + uz * (4.5 * uz - 3.0) - u2);
+
+  LOCAL(srcGrid, NE) =
+      DFL3 * rho * (1.0 + (+ux + uy) * (4.5 * (+ux + uy) + 3.0) - u2);
+  LOCAL(srcGrid, NW) =
+      DFL3 * rho * (1.0 + (-ux + uy) * (4.5 * (-ux + uy) + 3.0) - u2);
+  LOCAL(srcGrid, SE) =
+      DFL3 * rho * (1.0 + (+ux - uy) * (4.5 * (+ux - uy) + 3.0) - u2);
+  LOCAL(srcGrid, SW) =
+      DFL3 * rho * (1.0 + (-ux - uy) * (4.5 * (-ux - uy) + 3.0) - u2);
+  LOCAL(srcGrid, NT) =
+      DFL3 * rho * (1.0 + (+uy + uz) * (4.5 * (+uy + uz) + 3.0) - u2);
+  LOCAL(srcGrid, NB) =
+      DFL3 * rho * (1.0 + (+uy - uz) * (4.5 * (+uy - uz) + 3.0) - u2);
+  LOCAL(srcGrid, ST) =
+      DFL3 * rho * (1.0 + (-uy + uz) * (4.5 * (-uy + uz) + 3.0) - u2);
+  LOCAL(srcGrid, SB) =
+      DFL3 * rho * (1.0 + (-uy - uz) * (4.5 * (-uy - uz) + 3.0) - u2);
+  LOCAL(srcGrid, ET) =
+      DFL3 * rho * (1.0 + (+ux + uz) * (4.5 * (+ux + uz) + 3.0) - u2);
+  LOCAL(srcGrid, EB) =
+      DFL3 * rho * (1.0 + (+ux - uz) * (4.5 * (+ux - uz) + 3.0) - u2);
+  LOCAL(srcGrid, WT) =
+      DFL3 * rho * (1.0 + (-ux + uz) * (4.5 * (-ux + uz) + 3.0) - u2);
+  LOCAL(srcGrid, WB) =
+      DFL3 * rho * (1.0 + (-ux - uz) * (4.5 * (-ux - uz) + 3.0) - u2);
+  SWEEP_END
+
+  /* outflow */
+  /*voption indep*/
 #if !defined(SPEC_CPU)
 #ifdef _OPENMP
-#pragma omp parallel for private( ux, uy, uz, rho, ux1, uy1, uz1, rho1, \
-                                  ux2, uy2, uz2, rho2, u2, px, py )
+#pragma omp parallel for private(ux, uy, uz, rho, ux1, uy1, uz1, rho1, ux2,    \
+                                 uy2, uz2, rho2, u2, px, py)
 #endif
 #endif
 
-	SWEEP_START( 0, 0, SIZE_Z-1, 0, 0, SIZE_Z )
-		rho1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, C  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, N  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, S  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, E  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, W  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, T  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, B  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB );
-		ux1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, E  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, W  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB );
-		uy1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, N  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, S  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NE ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NW )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB );
-		uz1 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, T  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, B  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, NB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, SB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, ET ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, EB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -1, WB );
-
-		ux1 /= rho1;
-		uy1 /= rho1;
-		uz1 /= rho1;
-
-		rho2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, C  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, N  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, S  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, E  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, W  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, T  )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, B  ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT )
-		       + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB );
-		ux2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, E  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, W  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB );
-		uy2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, N  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, S  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NE ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NW )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SE ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SW )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT ) + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB )
-		      - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB );
-		uz2 = + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, T  ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, B  )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, NB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ST ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, SB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, ET ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, EB )
-		      + GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WT ) - GRID_ENTRY_SWEEP( srcGrid, 0, 0, -2, WB );
-
-		ux2 /= rho2;
-		uy2 /= rho2;
-		uz2 /= rho2;
-
-		rho = 1.0;
-
-		ux = 2*ux1 - ux2;
-		uy = 2*uy1 - uy2;
-		uz = 2*uz1 - uz2;
-
-		u2 = 1.5 * (ux*ux + uy*uy + uz*uz);
-
-		LOCAL( srcGrid, C ) = DFL1*rho*(1.0                                 - u2);
-
-		LOCAL( srcGrid, N ) = DFL2*rho*(1.0 +       uy*(4.5*uy       + 3.0) - u2);
-		LOCAL( srcGrid, S ) = DFL2*rho*(1.0 +       uy*(4.5*uy       - 3.0) - u2);
-		LOCAL( srcGrid, E ) = DFL2*rho*(1.0 +       ux*(4.5*ux       + 3.0) - u2);
-		LOCAL( srcGrid, W ) = DFL2*rho*(1.0 +       ux*(4.5*ux       - 3.0) - u2);
-		LOCAL( srcGrid, T ) = DFL2*rho*(1.0 +       uz*(4.5*uz       + 3.0) - u2);
-		LOCAL( srcGrid, B ) = DFL2*rho*(1.0 +       uz*(4.5*uz       - 3.0) - u2);
-
-		LOCAL( srcGrid, NE) = DFL3*rho*(1.0 + (+ux+uy)*(4.5*(+ux+uy) + 3.0) - u2);
-		LOCAL( srcGrid, NW) = DFL3*rho*(1.0 + (-ux+uy)*(4.5*(-ux+uy) + 3.0) - u2);
-		LOCAL( srcGrid, SE) = DFL3*rho*(1.0 + (+ux-uy)*(4.5*(+ux-uy) + 3.0) - u2);
-		LOCAL( srcGrid, SW) = DFL3*rho*(1.0 + (-ux-uy)*(4.5*(-ux-uy) + 3.0) - u2);
-		LOCAL( srcGrid, NT) = DFL3*rho*(1.0 + (+uy+uz)*(4.5*(+uy+uz) + 3.0) - u2);
-		LOCAL( srcGrid, NB) = DFL3*rho*(1.0 + (+uy-uz)*(4.5*(+uy-uz) + 3.0) - u2);
-		LOCAL( srcGrid, ST) = DFL3*rho*(1.0 + (-uy+uz)*(4.5*(-uy+uz) + 3.0) - u2);
-		LOCAL( srcGrid, SB) = DFL3*rho*(1.0 + (-uy-uz)*(4.5*(-uy-uz) + 3.0) - u2);
-		LOCAL( srcGrid, ET) = DFL3*rho*(1.0 + (+ux+uz)*(4.5*(+ux+uz) + 3.0) - u2);
-		LOCAL( srcGrid, EB) = DFL3*rho*(1.0 + (+ux-uz)*(4.5*(+ux-uz) + 3.0) - u2);
-		LOCAL( srcGrid, WT) = DFL3*rho*(1.0 + (-ux+uz)*(4.5*(-ux+uz) + 3.0) - u2);
-		LOCAL( srcGrid, WB) = DFL3*rho*(1.0 + (-ux-uz)*(4.5*(-ux-uz) + 3.0) - u2);
-	SWEEP_END
+  SWEEP_START(0, 0, SIZE_Z - 1, 0, 0, SIZE_Z) /* outflow sweep; presumably the z = SIZE_Z - 1 layer -- macro is defined elsewhere, confirm */
+  rho1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, C) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, N) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, S) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, E) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, W) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, T) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, B) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB); /* rho1: sum of the 19 entries addressed with offset arguments (0, 0, -1) */
+  ux1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, E) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, W) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB); /* entries named with E minus entries named with W; divided by rho1 below */
+  uy1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, N) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, S) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NE) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NW) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB); /* entries named with N minus entries named with S */
+  uz1 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, T) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, B) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, NB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ST) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, SB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, ET) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, EB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -1, WB); /* entries named with T minus entries named with B */
+
+  ux1 /= rho1; /* no guard against rho1 == 0 */
+  uy1 /= rho1;
+  uz1 /= rho1;
+
+  rho2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, C) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, N) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, S) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, E) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, W) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, T) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, B) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) +
+         GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB); /* rho2: the same sum with offset arguments (0, 0, -2) */
+  ux2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, E) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, W) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB); /* same combination as ux1, with offset arguments (0, 0, -2) */
+  uy2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, N) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, S) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NE) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NW) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SE) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SW) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB); /* same combination as uy1, with offset arguments (0, 0, -2) */
+  uz2 = +GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, T) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, B) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, NB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ST) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, SB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, ET) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, EB) +
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WT) -
+        GRID_ENTRY_SWEEP(srcGrid, 0, 0, -2, WB); /* same combination as uz1, with offset arguments (0, 0, -2) */
+
+  ux2 /= rho2; /* no guard against rho2 == 0 */
+  uy2 /= rho2;
+  uz2 /= rho2;
+
+  rho = 1.0; /* density is fixed at 1 here; rho1 and rho2 only normalise the velocities */
+
+  ux = 2 * ux1 - ux2; /* velocity linearly extrapolated from the -1 and -2 values */
+  uy = 2 * uy1 - uy2;
+  uz = 2 * uz1 - uz2;
+
+  u2 = 1.5 * (ux * ux + uy * uy + uz * uz); /* 1.5 * |u|^2, shared by every entry below */
+
+  LOCAL(srcGrid, C) = DFL1 * rho * (1.0 - u2); /* all 19 entries are rebuilt from rho and u only; previous values are not read */
+
+  LOCAL(srcGrid, N) = DFL2 * rho * (1.0 + uy * (4.5 * uy + 3.0) - u2);
+  LOCAL(srcGrid, S) = DFL2 * rho * (1.0 + uy * (4.5 * uy - 3.0) - u2);
+  LOCAL(srcGrid, E) = DFL2 * rho * (1.0 + ux * (4.5 * ux + 3.0) - u2);
+  LOCAL(srcGrid, W) = DFL2 * rho * (1.0 + ux * (4.5 * ux - 3.0) - u2);
+  LOCAL(srcGrid, T) = DFL2 * rho * (1.0 + uz * (4.5 * uz + 3.0) - u2);
+  LOCAL(srcGrid, B) = DFL2 * rho * (1.0 + uz * (4.5 * uz - 3.0) - u2);
+
+  LOCAL(srcGrid, NE) =
+      DFL3 * rho * (1.0 + (+ux + uy) * (4.5 * (+ux + uy) + 3.0) - u2);
+  LOCAL(srcGrid, NW) =
+      DFL3 * rho * (1.0 + (-ux + uy) * (4.5 * (-ux + uy) + 3.0) - u2);
+  LOCAL(srcGrid, SE) =
+      DFL3 * rho * (1.0 + (+ux - uy) * (4.5 * (+ux - uy) + 3.0) - u2);
+  LOCAL(srcGrid, SW) =
+      DFL3 * rho * (1.0 + (-ux - uy) * (4.5 * (-ux - uy) + 3.0) - u2);
+  LOCAL(srcGrid, NT) =
+      DFL3 * rho * (1.0 + (+uy + uz) * (4.5 * (+uy + uz) + 3.0) - u2);
+  LOCAL(srcGrid, NB) =
+      DFL3 * rho * (1.0 + (+uy - uz) * (4.5 * (+uy - uz) + 3.0) - u2);
+  LOCAL(srcGrid, ST) =
+      DFL3 * rho * (1.0 + (-uy + uz) * (4.5 * (-uy + uz) + 3.0) - u2);
+  LOCAL(srcGrid, SB) =
+      DFL3 * rho * (1.0 + (-uy - uz) * (4.5 * (-uy - uz) + 3.0) - u2);
+  LOCAL(srcGrid, ET) =
+      DFL3 * rho * (1.0 + (+ux + uz) * (4.5 * (+ux + uz) + 3.0) - u2);
+  LOCAL(srcGrid, EB) =
+      DFL3 * rho * (1.0 + (+ux - uz) * (4.5 * (+ux - uz) + 3.0) - u2);
+  LOCAL(srcGrid, WT) =
+      DFL3 * rho * (1.0 + (-ux + uz) * (4.5 * (-ux + uz) + 3.0) - u2);
+  LOCAL(srcGrid, WB) =
+      DFL3 * rho * (1.0 + (-ux - uz) * (4.5 * (-ux - uz) + 3.0) - u2);
+  SWEEP_END
 }
 
 /*############################################################################*/
 
-void LBM_showGridStatistics( LBM_Grid grid ) {
-	int nObstacleCells = 0,
-	    nAccelCells    = 0,
-	    nFluidCells    = 0;
-	float ux, uy, uz;
-	float minU2  = 1e+30, maxU2  = -1e+30, u2;
-	float minRho = 1e+30, maxRho = -1e+30, rho;
-	float mass = 0;
-
-	SWEEP_VAR
-
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		rho = + LOCAL( grid, C  ) + LOCAL( grid, N  )
-		      + LOCAL( grid, S  ) + LOCAL( grid, E  )
-		      + LOCAL( grid, W  ) + LOCAL( grid, T  )
-		      + LOCAL( grid, B  ) + LOCAL( grid, NE )
-		      + LOCAL( grid, NW ) + LOCAL( grid, SE )
-		      + LOCAL( grid, SW ) + LOCAL( grid, NT )
-		      + LOCAL( grid, NB ) + LOCAL( grid, ST )
-		      + LOCAL( grid, SB ) + LOCAL( grid, ET )
-		      + LOCAL( grid, EB ) + LOCAL( grid, WT )
-		      + LOCAL( grid, WB );
-		if( rho < minRho ) minRho = rho;
-		if( rho > maxRho ) maxRho = rho;
-		mass += rho;
-
-		if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
-			nObstacleCells++;
-		}
-		else {
-			if( TEST_FLAG_SWEEP( grid, ACCEL ))
-				nAccelCells++;
-			else
-				nFluidCells++;
-
-			ux = + LOCAL( grid, E  ) - LOCAL( grid, W  )
-			     + LOCAL( grid, NE ) - LOCAL( grid, NW )
-			     + LOCAL( grid, SE ) - LOCAL( grid, SW )
-			     + LOCAL( grid, ET ) + LOCAL( grid, EB )
-			     - LOCAL( grid, WT ) - LOCAL( grid, WB );
-			uy = + LOCAL( grid, N  ) - LOCAL( grid, S  )
-			     + LOCAL( grid, NE ) + LOCAL( grid, NW )
-			     - LOCAL( grid, SE ) - LOCAL( grid, SW )
-			     + LOCAL( grid, NT ) + LOCAL( grid, NB )
-			     - LOCAL( grid, ST ) - LOCAL( grid, SB );
-			uz = + LOCAL( grid, T  ) - LOCAL( grid, B  )
-			     + LOCAL( grid, NT ) - LOCAL( grid, NB )
-			     + LOCAL( grid, ST ) - LOCAL( grid, SB )
-			     + LOCAL( grid, ET ) - LOCAL( grid, EB )
-			     + LOCAL( grid, WT ) - LOCAL( grid, WB );
-			u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);
-			if( u2 < minU2 ) minU2 = u2;
-			if( u2 > maxU2 ) maxU2 = u2;
-		}
-	SWEEP_END
-
-        printf( "LBM_showGridStatistics:\n"
-        "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
-        "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
-        "\tminU: %e maxU: %e\n\n",
-        nObstacleCells, nAccelCells, nFluidCells,
-        minRho, maxRho, mass,
-        sqrt( minU2 ), sqrt( maxU2 ) );
-
+void LBM_showGridStatistics(LBM_Grid grid) { /* prints cell counts, min/max density, total mass and min/max speed to stdout */
+  int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0;
+  float ux, uy, uz;
+  float minU2 = 1e+30, maxU2 = -1e+30, u2; /* sentinels; they survive if no non-obstacle cell is swept */
+  float minRho = 1e+30, maxRho = -1e+30, rho;
+  float mass = 0;
+
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z) /* presumably visits every cell of the grid -- macro is defined elsewhere, confirm */
+  rho = +LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) +
+        LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) +
+        LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) +
+        LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) +
+        LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB); /* sum of all 19 entries of the current cell */
+  if (rho < minRho)
+    minRho = rho;
+  if (rho > maxRho)
+    maxRho = rho;
+  mass += rho; /* runs before the obstacle test, so obstacle cells count towards mass and min/max rho */
+
+  if (TEST_FLAG_SWEEP(grid, OBSTACLE)) {
+    nObstacleCells++;
+  } else { /* velocity statistics are gathered for non-obstacle cells only */
+    if (TEST_FLAG_SWEEP(grid, ACCEL))
+      nAccelCells++;
+    else
+      nFluidCells++;
+
+    ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) +
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) -
+         LOCAL(grid, WT) - LOCAL(grid, WB); /* entries named with E minus entries named with W */
+    uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) -
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) -
+         LOCAL(grid, ST) - LOCAL(grid, SB); /* entries named with N minus entries named with S */
+    uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) +
+         LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) +
+         LOCAL(grid, WT) - LOCAL(grid, WB); /* entries named with T minus entries named with B */
+    u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho); /* squared magnitude of (ux, uy, uz) / rho; no guard against rho == 0 */
+    if (u2 < minU2)
+      minU2 = u2;
+    if (u2 > maxU2)
+      maxU2 = u2;
+  }
+  SWEEP_END
+
+  printf("LBM_showGridStatistics:\n"
+         "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
+         "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
+         "\tminU: %e maxU: %e\n\n",
+         nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass,
+         sqrt(minU2), sqrt(maxU2)); /* sqrt(maxU2) is NaN if maxU2 still holds its negative sentinel */
 }
 
 /*############################################################################*/
 
-static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		const char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1];
-
-		fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
-	else {                                                     /* little endian */
-		fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+/* Writes *v to file in little-endian byte order, whatever the host order. */
+static void storeValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int endianProbe = 1; /* its first byte is 0 only on a big-endian host */
+  if ((*((unsigned char *)&endianProbe)) == 0) { /* big endian */
+    const char *vPtr = (const char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    size_t i; /* unsigned, matching the type of sizeof */
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; /* reverse bytes */
+    fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+  } else { /* little endian */
+    fwrite(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1];
-	}
-	else {                                                     /* little endian */
-		fread( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+/* Reads one OUTPUT_PRECISION stored little-endian in file into *v. */
+static void loadValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int endianProbe = 1; /* its first byte is 0 only on a big-endian host */
+  if ((*((unsigned char *)&endianProbe)) == 0) { /* big endian */
+    char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    size_t i; /* unsigned, matching the type of sizeof */
+    if (fread(buffer, sizeof(OUTPUT_PRECISION), 1, file) != 1)
+      return; /* short read: leave *v as is, never copy unread stack bytes */
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; /* reverse bytes */
+  } else { /* little endian */
+    fread(v, sizeof(OUTPUT_PRECISION), 1, file); /* result unchecked, as before */
+  }
 }
 
 /*############################################################################*/
 
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                             const int binary ) {
-	int x, y, z;
-	OUTPUT_PRECISION rho, ux, uy, uz;
-
-	FILE* file = fopen( filename, (binary ? "wb" : "w") );
-
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				rho = + GRID_ENTRY( grid, x, y, z, C  ) + GRID_ENTRY( grid, x, y, z, N  )
-				      + GRID_ENTRY( grid, x, y, z, S  ) + GRID_ENTRY( grid, x, y, z, E  )
-				      + GRID_ENTRY( grid, x, y, z, W  ) + GRID_ENTRY( grid, x, y, z, T  )
-				      + GRID_ENTRY( grid, x, y, z, B  ) + GRID_ENTRY( grid, x, y, z, NE )
-				      + GRID_ENTRY( grid, x, y, z, NW ) + GRID_ENTRY( grid, x, y, z, SE )
-				      + GRID_ENTRY( grid, x, y, z, SW ) + GRID_ENTRY( grid, x, y, z, NT )
-				      + GRID_ENTRY( grid, x, y, z, NB ) + GRID_ENTRY( grid, x, y, z, ST )
-				      + GRID_ENTRY( grid, x, y, z, SB ) + GRID_ENTRY( grid, x, y, z, ET )
-				      + GRID_ENTRY( grid, x, y, z, EB ) + GRID_ENTRY( grid, x, y, z, WT )
-				      + GRID_ENTRY( grid, x, y, z, WB );
-				ux = + GRID_ENTRY( grid, x, y, z, E  ) - GRID_ENTRY( grid, x, y, z, W  ) 
-				     + GRID_ENTRY( grid, x, y, z, NE ) - GRID_ENTRY( grid, x, y, z, NW ) 
-				     + GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) 
-				     + GRID_ENTRY( grid, x, y, z, ET ) + GRID_ENTRY( grid, x, y, z, EB ) 
-				     - GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB );
-				uy = + GRID_ENTRY( grid, x, y, z, N  ) - GRID_ENTRY( grid, x, y, z, S  ) 
-				     + GRID_ENTRY( grid, x, y, z, NE ) + GRID_ENTRY( grid, x, y, z, NW ) 
-				     - GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) 
-				     + GRID_ENTRY( grid, x, y, z, NT ) + GRID_ENTRY( grid, x, y, z, NB ) 
-				     - GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB );
-				uz = + GRID_ENTRY( grid, x, y, z, T  ) - GRID_ENTRY( grid, x, y, z, B  ) 
-				     + GRID_ENTRY( grid, x, y, z, NT ) - GRID_ENTRY( grid, x, y, z, NB ) 
-				     + GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ) 
-				     + GRID_ENTRY( grid, x, y, z, ET ) - GRID_ENTRY( grid, x, y, z, EB ) 
-				     + GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					/*
-					fwrite( &ux, sizeof( ux ), 1, file );
-					fwrite( &uy, sizeof( uy ), 1, file );
-					fwrite( &uz, sizeof( uz ), 1, file );
-					*/
-					storeValue( file, &ux );
-					storeValue( file, &uy );
-					storeValue( file, &uz );
-				} else
-					fprintf( file, "%e %e %e\n", ux, uy, uz );
-
-			}
-		}
-	}
-
-	fclose( file );
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const int binary) {
+  int x, y, z;
+  OUTPUT_PRECISION rho, ux, uy, uz;
+  /* Writes ux, uy, uz for every cell (x fastest, then y, then z): storeValue()
+   * output when binary is non-zero, else one "%e %e %e" text line per cell. */
+  FILE *file = fopen(filename, (binary ? "wb" : "w"));
+  if (file == NULL) { /* without this a NULL stream reached fprintf()/fclose() */
+    fprintf(stderr, "LBM_storeVelocityField: cannot open %s\n", filename);
+    return;
+  }
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        rho = +GRID_ENTRY(grid, x, y, z, C) + GRID_ENTRY(grid, x, y, z, N) +
+              GRID_ENTRY(grid, x, y, z, S) + GRID_ENTRY(grid, x, y, z, E) +
+              GRID_ENTRY(grid, x, y, z, W) + GRID_ENTRY(grid, x, y, z, T) +
+              GRID_ENTRY(grid, x, y, z, B) + GRID_ENTRY(grid, x, y, z, NE) +
+              GRID_ENTRY(grid, x, y, z, NW) + GRID_ENTRY(grid, x, y, z, SE) +
+              GRID_ENTRY(grid, x, y, z, SW) + GRID_ENTRY(grid, x, y, z, NT) +
+              GRID_ENTRY(grid, x, y, z, NB) + GRID_ENTRY(grid, x, y, z, ST) +
+              GRID_ENTRY(grid, x, y, z, SB) + GRID_ENTRY(grid, x, y, z, ET) +
+              GRID_ENTRY(grid, x, y, z, EB) + GRID_ENTRY(grid, x, y, z, WT) +
+              GRID_ENTRY(grid, x, y, z, WB);
+        ux = +GRID_ENTRY(grid, x, y, z, E) - GRID_ENTRY(grid, x, y, z, W) +
+             GRID_ENTRY(grid, x, y, z, NE) - GRID_ENTRY(grid, x, y, z, NW) +
+             GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) +
+             GRID_ENTRY(grid, x, y, z, ET) + GRID_ENTRY(grid, x, y, z, EB) -
+             GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB);
+        uy = +GRID_ENTRY(grid, x, y, z, N) - GRID_ENTRY(grid, x, y, z, S) +
+             GRID_ENTRY(grid, x, y, z, NE) + GRID_ENTRY(grid, x, y, z, NW) -
+             GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) +
+             GRID_ENTRY(grid, x, y, z, NT) + GRID_ENTRY(grid, x, y, z, NB) -
+             GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB);
+        uz = +GRID_ENTRY(grid, x, y, z, T) - GRID_ENTRY(grid, x, y, z, B) +
+             GRID_ENTRY(grid, x, y, z, NT) - GRID_ENTRY(grid, x, y, z, NB) +
+             GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB) +
+             GRID_ENTRY(grid, x, y, z, ET) - GRID_ENTRY(grid, x, y, z, EB) +
+             GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB);
+        /* Normalize the directional sums by rho, the sum of entries C..WB. */
+        ux /= rho;
+        uy /= rho;
+        uz /= rho;
+        if (binary) {
+          /* storeValue() byte-swaps on big-endian hosts, unlike a raw fwrite(). */
+          storeValue(file, &ux);
+          storeValue(file, &uy);
+          storeValue(file, &uz);
+        } else
+          fprintf(file, "%e %e %e\n", ux, uy, uz);
+      }
+    }
+  }
+
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_compareVelocityField( LBM_Grid grid, const char* filename,
-                             const int binary ) {
-	int x, y, z;
-	float rho, ux, uy, uz;
-	OUTPUT_PRECISION fileUx, fileUy, fileUz,
-	                 dUx, dUy, dUz,
-	                 diff2, maxDiff2 = -1e+30;
-
-	FILE* file = fopen( filename, (binary ? "rb" : "r") );
-
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				rho = + GRID_ENTRY( grid, x, y, z, C  ) + GRID_ENTRY( grid, x, y, z, N  )
-				      + GRID_ENTRY( grid, x, y, z, S  ) + GRID_ENTRY( grid, x, y, z, E  )
-				      + GRID_ENTRY( grid, x, y, z, W  ) + GRID_ENTRY( grid, x, y, z, T  )
-				      + GRID_ENTRY( grid, x, y, z, B  ) + GRID_ENTRY( grid, x, y, z, NE )
-				      + GRID_ENTRY( grid, x, y, z, NW ) + GRID_ENTRY( grid, x, y, z, SE )
-				      + GRID_ENTRY( grid, x, y, z, SW ) + GRID_ENTRY( grid, x, y, z, NT )
-				      + GRID_ENTRY( grid, x, y, z, NB ) + GRID_ENTRY( grid, x, y, z, ST )
-				      + GRID_ENTRY( grid, x, y, z, SB ) + GRID_ENTRY( grid, x, y, z, ET )
-				      + GRID_ENTRY( grid, x, y, z, EB ) + GRID_ENTRY( grid, x, y, z, WT )
-				      + GRID_ENTRY( grid, x, y, z, WB );
-				ux = + GRID_ENTRY( grid, x, y, z, E  ) - GRID_ENTRY( grid, x, y, z, W  ) 
-				     + GRID_ENTRY( grid, x, y, z, NE ) - GRID_ENTRY( grid, x, y, z, NW ) 
-				     + GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) 
-				     + GRID_ENTRY( grid, x, y, z, ET ) + GRID_ENTRY( grid, x, y, z, EB ) 
-				     - GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB );
-				uy = + GRID_ENTRY( grid, x, y, z, N  ) - GRID_ENTRY( grid, x, y, z, S  ) 
-				     + GRID_ENTRY( grid, x, y, z, NE ) + GRID_ENTRY( grid, x, y, z, NW ) 
-				     - GRID_ENTRY( grid, x, y, z, SE ) - GRID_ENTRY( grid, x, y, z, SW ) 
-				     + GRID_ENTRY( grid, x, y, z, NT ) + GRID_ENTRY( grid, x, y, z, NB ) 
-				     - GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB );
-				uz = + GRID_ENTRY( grid, x, y, z, T  ) - GRID_ENTRY( grid, x, y, z, B  ) 
-				     + GRID_ENTRY( grid, x, y, z, NT ) - GRID_ENTRY( grid, x, y, z, NB ) 
-				     + GRID_ENTRY( grid, x, y, z, ST ) - GRID_ENTRY( grid, x, y, z, SB ) 
-				     + GRID_ENTRY( grid, x, y, z, ET ) - GRID_ENTRY( grid, x, y, z, EB ) 
-				     + GRID_ENTRY( grid, x, y, z, WT ) - GRID_ENTRY( grid, x, y, z, WB );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					loadValue( file, &fileUx );
-					loadValue( file, &fileUy );
-					loadValue( file, &fileUz );
-				}
-				else {
-					if( sizeof( OUTPUT_PRECISION ) == sizeof( double )) {
-						fscanf( file, "%lf %lf %lf\n", &fileUx, &fileUy, &fileUz );
-					}
-					else {
-						fscanf( file, "%f %f %f\n", &fileUx, &fileUy, &fileUz );
-					}
-				}
-
-				dUx = ux - fileUx;
-				dUy = uy - fileUy;
-				dUz = uz - fileUz;
-				diff2 = dUx*dUx + dUy*dUy + dUz*dUz;
-				if( diff2 > maxDiff2 ) maxDiff2 = diff2;
-			}
-		}
-	}
+void LBM_compareVelocityField(LBM_Grid grid, const char *filename,
+                              const int binary) { /* grid vs. reference file */
+  int x, y, z;
+  float rho, ux, uy, uz;
+  OUTPUT_PRECISION fileUx, fileUy, fileUz, dUx, dUy, dUz, diff2,
+      maxDiff2 = -1e+30;
+  FILE *file = fopen(filename, (binary ? "rb" : "r"));
+  if (file == NULL) { /* without this a NULL stream reached fread()/fscanf() */
+    fprintf(stderr, "LBM_compareVelocityField: cannot open %s\n", filename);
+    return;
+  }
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        rho = +GRID_ENTRY(grid, x, y, z, C) + GRID_ENTRY(grid, x, y, z, N) +
+              GRID_ENTRY(grid, x, y, z, S) + GRID_ENTRY(grid, x, y, z, E) +
+              GRID_ENTRY(grid, x, y, z, W) + GRID_ENTRY(grid, x, y, z, T) +
+              GRID_ENTRY(grid, x, y, z, B) + GRID_ENTRY(grid, x, y, z, NE) +
+              GRID_ENTRY(grid, x, y, z, NW) + GRID_ENTRY(grid, x, y, z, SE) +
+              GRID_ENTRY(grid, x, y, z, SW) + GRID_ENTRY(grid, x, y, z, NT) +
+              GRID_ENTRY(grid, x, y, z, NB) + GRID_ENTRY(grid, x, y, z, ST) +
+              GRID_ENTRY(grid, x, y, z, SB) + GRID_ENTRY(grid, x, y, z, ET) +
+              GRID_ENTRY(grid, x, y, z, EB) + GRID_ENTRY(grid, x, y, z, WT) +
+              GRID_ENTRY(grid, x, y, z, WB);
+        ux = +GRID_ENTRY(grid, x, y, z, E) - GRID_ENTRY(grid, x, y, z, W) +
+             GRID_ENTRY(grid, x, y, z, NE) - GRID_ENTRY(grid, x, y, z, NW) +
+             GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) +
+             GRID_ENTRY(grid, x, y, z, ET) + GRID_ENTRY(grid, x, y, z, EB) -
+             GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB);
+        uy = +GRID_ENTRY(grid, x, y, z, N) - GRID_ENTRY(grid, x, y, z, S) +
+             GRID_ENTRY(grid, x, y, z, NE) + GRID_ENTRY(grid, x, y, z, NW) -
+             GRID_ENTRY(grid, x, y, z, SE) - GRID_ENTRY(grid, x, y, z, SW) +
+             GRID_ENTRY(grid, x, y, z, NT) + GRID_ENTRY(grid, x, y, z, NB) -
+             GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB);
+        uz = +GRID_ENTRY(grid, x, y, z, T) - GRID_ENTRY(grid, x, y, z, B) +
+             GRID_ENTRY(grid, x, y, z, NT) - GRID_ENTRY(grid, x, y, z, NB) +
+             GRID_ENTRY(grid, x, y, z, ST) - GRID_ENTRY(grid, x, y, z, SB) +
+             GRID_ENTRY(grid, x, y, z, ET) - GRID_ENTRY(grid, x, y, z, EB) +
+             GRID_ENTRY(grid, x, y, z, WT) - GRID_ENTRY(grid, x, y, z, WB);
+        ux /= rho; /* normalize by rho, the sum of entries C..WB */
+        uy /= rho;
+        uz /= rho;
+        if (binary) {
+          loadValue(file, &fileUx);
+          loadValue(file, &fileUy);
+          loadValue(file, &fileUz);
+        } else {
+          if (sizeof(OUTPUT_PRECISION) == sizeof(double)) {
+            fscanf(file, "%lf %lf %lf\n", &fileUx, &fileUy, &fileUz);
+          } else { /* NOTE(review): fscanf() results are not checked */
+            fscanf(file, "%f %f %f\n", &fileUx, &fileUy, &fileUz);
+          }
+        }
+        dUx = ux - fileUx;
+        dUy = uy - fileUy;
+        dUz = uz - fileUz;
+        diff2 = dUx * dUx + dUy * dUy + dUz * dUz;
+        if (diff2 > maxDiff2)
+          maxDiff2 = diff2; /* largest squared deviation seen so far */
+      }
+    }
+  }
 
 #if defined(SPEC_CPU)
-	printf( "LBM_compareVelocityField: maxDiff = %e  \n\n",
-	        sqrt( maxDiff2 )  );
+  printf("LBM_compareVelocityField: maxDiff = %e  \n\n", sqrt(maxDiff2));
 #else
-	printf( "LBM_compareVelocityField: maxDiff = %e  ==>  %s\n\n",
-	        sqrt( maxDiff2 ),
-	        sqrt( maxDiff2 ) > 1e-5 ? "##### ERROR #####" : "OK" );
+  printf("LBM_compareVelocityField: maxDiff = %e  ==>  %s\n\n", sqrt(maxDiff2),
+         sqrt(maxDiff2) > 1e-5 ? "##### ERROR #####" : "OK");
 #endif
-	fclose( file );
+  fclose(file);
 }
-
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.h
index e35818c0b300593f382a61131e7a35584d35cee1..94189f0f2bcc080ed79e42941b5a0638649d46e3 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm.h
@@ -18,30 +18,31 @@ typedef enum {C = 0,
               NT, NB, ST, SB,
               ET, EB, WT, WB,
               FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
-	      */
+              */
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
-
+typedef enum { /* per-cell flag bits; each value is a distinct bit, so they can be OR-ed */
+  OBSTACLE = 1 << 0, /* bit 0 */
+  ACCEL = 1 << 1, /* bit 1 */
+  IN_OUT_FLOW = 1 << 2 /* bit 2 */
+} CELL_FLAGS;
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_initializeSpecialCellsForChannel( LBM_Grid grid );
-void LBM_swapGrids( LBM_GridPtr* grid1, LBM_GridPtr* grid2 );
-void LBM_performStreamCollide( LBM_Grid srcGrid, LBM_Grid dstGrid );
-void LBM_handleInOutFlow( LBM_Grid srcGrid );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
-void LBM_compareVelocityField( LBM_Grid grid, const char* filename,
-                             const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_initializeSpecialCellsForChannel(LBM_Grid grid);
+void LBM_swapGrids(LBM_GridPtr *grid1, LBM_GridPtr *grid2);
+void LBM_performStreamCollide(LBM_Grid srcGrid, LBM_Grid dstGrid);
+void LBM_handleInOutFlow(LBM_Grid srcGrid);
+void LBM_showGridStatistics(LBM_Grid Grid);
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary); /* NOTE(review): presumably the lbm.c definition taking `const int binary` - confirm BOOL is int */
+void LBM_compareVelocityField(LBM_Grid grid, const char *filename,
+                              const BOOL binary); /* NOTE(review): same BOOL vs. int question as above */
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm_1d_array.h b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm_1d_array.h
index 42c999e204dffc83c1affe8d56e086dcf1815b43..92b4c1b21dc9d87531691b3fce4bd1ff01b201f8 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm_1d_array.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/lbm_1d_array.h
@@ -3,163 +3,204 @@
 #ifndef _LBM_MACROS_H_
 #define _LBM_MACROS_H_
 
-typedef enum {C = 0,
-    N, S, E, W, T, B,
-    NE, NW, SE, SW,
-    NT, NB, ST, SB,
-    ET, EB, WT, WB,
-    FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
-#define SIZE   (120)
-#define SIZE_X (1*SIZE)
-#define SIZE_Y (1*SIZE)
+typedef enum { /* index of each float stored per grid cell (see CALC_INDEX) */
+  C = 0, /* offset (0, 0, 0) in NEIGHBOR_C */
+  N, /* +y in NEIGHBOR_N */
+  S, /* -y in NEIGHBOR_S */
+  E, /* +x in NEIGHBOR_E */
+  W, /* -x in NEIGHBOR_W */
+  T, /* +z in NEIGHBOR_T */
+  B, /* -z in NEIGHBOR_B */
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS, /* slot read as unsigned int flag bits by the *_FLAG macros */
+  N_CELL_ENTRIES /* number of slots per cell */
+} CELL_ENTRIES;
+#define SIZE (120) /* base edge length used for SIZE_X and SIZE_Y */
+#define SIZE_X (1 * SIZE) /* grid extent in x */
+#define SIZE_Y (1 * SIZE) /* grid extent in y */
 #define SIZE_Z (150)
 /*############################################################################*/
 
-typedef float LBM_Grid[SIZE_Z*SIZE_Y*SIZE_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+typedef float LBM_Grid[SIZE_Z * SIZE_Y * SIZE_X * N_CELL_ENTRIES]; /* flat array: N_CELL_ENTRIES floats per cell */
+typedef LBM_Grid *LBM_GridPtr; /* pointer to a whole grid array */
 
 /*############################################################################*/
 
-#define CALC_INDEX(x,y,z,e) ((e)+N_CELL_ENTRIES*((x)+ \
-						 (y)*SIZE_X+(z)*SIZE_X*SIZE_Y))
+#define CALC_INDEX(x, y, z, e)                                                 \
+  ((e) + N_CELL_ENTRIES * ((x) + (y)*SIZE_X + (z)*SIZE_X * SIZE_Y)) /* flat index of entry e in cell (x, y, z); x varies fastest */
 
 #define SWEEP_VAR int i;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-  for( i = CALC_INDEX(x1, y1, z1, 0); \
-       i < CALC_INDEX(x2, y2, z2, 0); \
-       i += N_CELL_ENTRIES ) {
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (i = CALC_INDEX(x1, y1, z1, 0); i < CALC_INDEX(x2, y2, z2, 0);           \
+       i += N_CELL_ENTRIES) { /* opens a cell-by-cell loop over i (declared by SWEEP_VAR); SWEEP_END closes it */
 
 #define SWEEP_END }
 
-#define SWEEP_X  ((i / N_CELL_ENTRIES) % SIZE_X)
+#define SWEEP_X ((i / N_CELL_ENTRIES) % SIZE_X) /* x coordinate of the cell at sweep index i */
 #define SWEEP_Y (((i / N_CELL_ENTRIES) / SIZE_X) % SIZE_Y)
-#define SWEEP_Z  ((i / N_CELL_ENTRIES) / (SIZE_X*SIZE_Y))
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX(dx, dy, dz, e)+(i)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_Z ((i / N_CELL_ENTRIES) / (SIZE_X * SIZE_Y)) /* z coordinate of the cell at sweep index i */
+/* GRID_ENTRY: entry e of cell (x, y, z). GRID_ENTRY_SWEEP: entry e of the cell offset by (dx, dy, dz) from sweep index i. */
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX(dx, dy, dz, e) + (i)])
+/* LOCAL is the current sweep cell; NEIGHBOR_<dir> offsets it by -1, 0 or +1 along x, y and z. */
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #define COLLIDE_STREAM
 #ifdef COLLIDE_STREAM
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C)) /* SRC_<dir>(g): entry <dir> of the current sweep cell */
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+/* DST_<dir>(g): entry <dir> of the neighbour cell that lies in direction <dir>. */
+#define DST_C(g) (NEIGHBOR_C(g, C))
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* COLLIDE_STREAM */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C)) /* inactive while COLLIDE_STREAM is defined above; SRC_<dir> reads entry <dir> of the neighbour in the opposite direction */
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+/* DST_<dir>(g): entry <dir> of the current sweep cell. */
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* COLLIDE_STREAM */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* const _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v)))) /* address of v viewed as unsigned int */
+#define FLAG_VAR(v) unsigned int *const _aux_ = MAGIC_CAST(v) /* declares _aux_ pointing at v's storage */
+/* NOTE(review): LBM_Grid elements are float, so these macros reinterpret float storage as unsigned int - check strict-aliasing assumptions. */
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f)) /* *_SWEEP variants use the current sweep cell (index i) */
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  } /* NOTE(review): plain brace block, not do { } while (0); a ';' after the call is an extra empty statement */
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+/* The variants below address the cell by explicit (x, y, z) coordinates instead of the sweep index. */
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.c b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.c
index 85600dbfdf20059a71694b7ae72f0243ee5c82eb..6985e3e58b300a7fad88ed4623340562693c80bd 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.c
@@ -8,10 +8,10 @@
 #include <stdlib.h>
 
 #if defined(SPEC_CPU)
-#   include <time.h>
+#include <time.h>
 #else
-#   include <sys/times.h>
-#   include <unistd.h>
+#include <sys/times.h>
+#include <unistd.h>
 #endif
 
 #include <sys/stat.h>
@@ -23,168 +23,169 @@ static LBM_GridPtr srcGrid, dstGrid;
 /*############################################################################*/
 
 struct pb_TimerSet timers;
-int main( int nArgs, char* arg[] ) {
-	MAIN_Param param;
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
 #if !defined(SPEC_CPU)
-	MAIN_Time time;
+  MAIN_Time time;
 #endif
-	int t;
+  int t;
 
-        pb_InitializeTimerSet(&timers);
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-        struct pb_Parameters* params;
-        params = pb_ReadParameters(&nArgs, arg);
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
 
-	MAIN_parseCommandLine( nArgs, arg, &param, params );
-	MAIN_printInfo( &param );
-	MAIN_initialize( &param );
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
+  MAIN_initialize(&param);
 #if !defined(SPEC_CPU)
-	MAIN_startClock( &time );
+  MAIN_startClock(&time);
 #endif
 
-	for( t = 1; t <= param.nTimeSteps; t++ ) {
-		if( param.simType == CHANNEL ) {
-			LBM_handleInOutFlow( *srcGrid );
-		}
+  for (t = 1; t <= param.nTimeSteps; t++) {
+    if (param.simType == CHANNEL) {
+      LBM_handleInOutFlow(*srcGrid);
+    }
 
-		LBM_performStreamCollide( *srcGrid, *dstGrid );
-		LBM_swapGrids( &srcGrid, &dstGrid );
+    LBM_performStreamCollide(*srcGrid, *dstGrid);
+    LBM_swapGrids(&srcGrid, &dstGrid);
 
-		if( (t & 63) == 0 ) {
-			printf( "timestep: %i\n", t );
-			//LBM_showGridStatistics( *srcGrid );
-		}
-	}
+    if ((t & 63) == 0) {
+      printf("timestep: %i\n", t);
+      // LBM_showGridStatistics( *srcGrid );
+    }
+  }
 
 #if !defined(SPEC_CPU)
-	MAIN_stopClock( &time, &param );
+  MAIN_stopClock(&time, &param);
 #endif
 
-	MAIN_finalize( &param );
+  MAIN_finalize(&param);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        pb_PrintTimerSet(&timers);
-        pb_FreeParameters(params);
-	return 0;
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(params);
+  return 0;
 }
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params) {
-	struct stat fileStat;
-	
-	if( nArgs < 2 ) {
-		printf( "syntax: lbm <time steps>\n" );
-		exit( 1 );
-	}
-
-	param->nTimeSteps     = atoi( arg[1] );
-
-	if( params->inpFiles[0] != NULL ) {
-		param->obstacleFilename = params->inpFiles[0];
-
-		if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-			printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-			         param->obstacleFilename );
-			exit( 1 );
-		}
-		if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-			printf( "MAIN_parseCommandLine:\n"
-			        "\tsize of file '%s' is %i bytes\n"
-					    "\texpected size is %i bytes\n",
-			        param->obstacleFilename, (int) fileStat.st_size,
-			        SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-			exit( 1 );
-		}
-	}
-	else param->obstacleFilename = NULL;
-
-	param->resultFilename = params->outFile;
-	param->action         = STORE;
-	param->simType        = LDC;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
+
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
+
+  param->nTimeSteps = atoi(arg[1]);
+
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
+    }
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
+    }
+  } else
+    param->obstacleFilename = NULL;
+
+  param->resultFilename = params->outFile;
+  param->action = STORE;
+  param->simType = LDC;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-	const char actionString[3][32] = {"nothing", "compare", "store"};
-	const char simTypeString[3][32] = {"lid-driven cavity", "channel flow"};
-	printf( "MAIN_printInfo:\n"
-	        "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-	        "\tnTimeSteps     : %i\n"
-	        "\tresult file    : %s\n"
-	        "\taction         : %s\n"
-	        "\tsimulation type: %s\n"
-	        "\tobstacle file  : %s\n\n",
-	        SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-	        param->nTimeSteps, param->resultFilename, 
-	        actionString[param->action], simTypeString[param->simType],
-	        (param->obstacleFilename == NULL) ? "<none>" :
-	                                            param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  const char actionString[3][32] = {"nothing", "compare", "store"};
+  const char simTypeString[3][32] = {"lid-driven cavity", "channel flow"};
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, actionString[param->action],
+         simTypeString[param->simType],
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
-void MAIN_initialize( const MAIN_Param* param ) {
-	LBM_allocateGrid( (float**) &srcGrid );
-	LBM_allocateGrid( (float**) &dstGrid );
+void MAIN_initialize(const MAIN_Param *param) {
+  LBM_allocateGrid((float **)&srcGrid);
+  LBM_allocateGrid((float **)&dstGrid);
 
-	LBM_initializeGrid( *srcGrid );
-	LBM_initializeGrid( *dstGrid );
+  LBM_initializeGrid(*srcGrid);
+  LBM_initializeGrid(*dstGrid);
 
-	if( param->obstacleFilename != NULL ) {
-		LBM_loadObstacleFile( *srcGrid, param->obstacleFilename );
-		LBM_loadObstacleFile( *dstGrid, param->obstacleFilename );
-	}
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(*srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(*dstGrid, param->obstacleFilename);
+  }
 
-	if( param->simType == CHANNEL ) {
-		LBM_initializeSpecialCellsForChannel( *srcGrid );
-		LBM_initializeSpecialCellsForChannel( *dstGrid );
-	}
-	else {
-		LBM_initializeSpecialCellsForLDC( *srcGrid );
-		LBM_initializeSpecialCellsForLDC( *dstGrid );
-	}
+  if (param->simType == CHANNEL) {
+    LBM_initializeSpecialCellsForChannel(*srcGrid);
+    LBM_initializeSpecialCellsForChannel(*dstGrid);
+  } else {
+    LBM_initializeSpecialCellsForLDC(*srcGrid);
+    LBM_initializeSpecialCellsForLDC(*dstGrid);
+  }
 
-	LBM_showGridStatistics( *srcGrid );
+  LBM_showGridStatistics(*srcGrid);
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param ) {
-	LBM_showGridStatistics( *srcGrid );
+void MAIN_finalize(const MAIN_Param *param) {
+  LBM_showGridStatistics(*srcGrid);
 
-	if( param->action == COMPARE )
-		LBM_compareVelocityField( *srcGrid, param->resultFilename, TRUE );
-	if( param->action == STORE )
-	LBM_storeVelocityField( *srcGrid, param->resultFilename, TRUE );
+  if (param->action == COMPARE)
+    LBM_compareVelocityField(*srcGrid, param->resultFilename, TRUE);
+  if (param->action == STORE)
+    LBM_storeVelocityField(*srcGrid, param->resultFilename, TRUE);
 
-	LBM_freeGrid( (float**) &srcGrid );
-	LBM_freeGrid( (float**) &dstGrid );
+  LBM_freeGrid((float **)&srcGrid);
+  LBM_freeGrid((float **)&dstGrid);
 }
 
 #if !defined(SPEC_CPU)
 /*############################################################################*/
 
-void MAIN_startClock( MAIN_Time* time ) {
-	time->timeScale = 1.0 / sysconf( _SC_CLK_TCK );
-	time->tickStart = times( &(time->timeStart) );
+void MAIN_startClock(MAIN_Time *time) {
+  time->timeScale = 1.0 / sysconf(_SC_CLK_TCK);
+  time->tickStart = times(&(time->timeStart));
 }
 
-
 /*############################################################################*/
 
-void MAIN_stopClock( MAIN_Time* time, const MAIN_Param* param ) {
-	time->tickStop = times( &(time->timeStop) );
-
-	printf( "MAIN_stopClock:\n"
-	        "\tusr: %7.2f sys: %7.2f tot: %7.2f wct: %7.2f MLUPS: %5.2f\n\n",
-	        (time->timeStop.tms_utime - time->timeStart.tms_utime) * time->timeScale,
-	        (time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale,
-	        (time->timeStop.tms_utime - time->timeStart.tms_utime +
-	         time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale,
-	        (time->tickStop           - time->tickStart          ) * time->timeScale,
-	        1.0e-6 * SIZE_X * SIZE_Y * SIZE_Z * param->nTimeSteps /
-	        (time->tickStop           - time->tickStart          ) / time->timeScale );
+void MAIN_stopClock(MAIN_Time *time, const MAIN_Param *param) {
+  time->tickStop = times(&(time->timeStop));
+
+  printf(
+      "MAIN_stopClock:\n"
+      "\tusr: %7.2f sys: %7.2f tot: %7.2f wct: %7.2f MLUPS: %5.2f\n\n",
+      (time->timeStop.tms_utime - time->timeStart.tms_utime) * time->timeScale,
+      (time->timeStop.tms_stime - time->timeStart.tms_stime) * time->timeScale,
+      (time->timeStop.tms_utime - time->timeStart.tms_utime +
+       time->timeStop.tms_stime - time->timeStart.tms_stime) *
+          time->timeScale,
+      (time->tickStop - time->tickStart) * time->timeScale,
+      1.0e-6 * SIZE_X * SIZE_Y * SIZE_Z * param->nTimeSteps /
+          (time->tickStop - time->tickStart) / time->timeScale);
 }
 #endif
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.h b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.h
index e207f4158f06a1cdf74ccc4fd0eb982543de0f87..4eb16dd70d0a121488ae657442b7e950a0afd16a 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/omp_cpu/main.h
@@ -18,34 +18,35 @@
 
 #if !defined(SPEC_CPU)
 typedef struct {
-	float timeScale;
-	clock_t tickStart, tickStop;
-	struct tms timeStart, timeStop;
+  float timeScale;
+  clock_t tickStart, tickStop;
+  struct tms timeStart, timeStop;
 
 } MAIN_Time;
 #endif
 
-typedef enum {NOTHING = 0, COMPARE, STORE} MAIN_Action;
-typedef enum {LDC = 0, CHANNEL} MAIN_SimType;
+typedef enum { NOTHING = 0, COMPARE, STORE } MAIN_Action;
+typedef enum { LDC = 0, CHANNEL } MAIN_SimType;
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	MAIN_Action action;
-	MAIN_SimType simType;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  MAIN_Action action;
+  MAIN_SimType simType;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param );
-void MAIN_finalize( const MAIN_Param* param );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param);
+void MAIN_finalize(const MAIN_Param *param);
 
 #if !defined(SPEC_CPU)
-void MAIN_startClock( MAIN_Time* time );
-void MAIN_stopClock( MAIN_Time* time, const MAIN_Param* param );
+void MAIN_startClock(MAIN_Time *time);
+void MAIN_stopClock(MAIN_Time *time, const MAIN_Param *param);
 #endif
 
 /*############################################################################*/
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/layout_config.h
index ef75410c43c337651291d2b27655ab26d73485d9..57b6b0875204536ee7cb7a5b12fb9e120e246fec 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/layout_config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/layout_config.h
@@ -13,33 +13,33 @@
 
 /*############################################################################*/
 
-//Unchangeable settings: volume simulation size for the given example
+// Unchangeable settings: volume simulation size for the given example
 #define SIZE_X (120)
 #define SIZE_Y (120)
 #define SIZE_Z (150)
 
-//Changeable settings
-//Padding in each dimension
+// Changeable settings
+// Padding in each dimension
 #define PADDING_X (8)
 #define PADDING_Y (0)
 #define PADDING_Z (4)
 
-//Pitch in each dimension
-#define PADDED_X (SIZE_X+PADDING_X)
-#define PADDED_Y (SIZE_Y+PADDING_Y)
-#define PADDED_Z (SIZE_Z+PADDING_Z)
+// Pitch in each dimension
+#define PADDED_X (SIZE_X + PADDING_X)
+#define PADDED_Y (SIZE_Y + PADDING_Y)
+#define PADDED_Z (SIZE_Z + PADDING_Z)
 
-#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
-#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)
+#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z)
+#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z)
 
-//Flattening function
+// Flattening function
 //  This macro will be used to map a 3-D index and element to a value
-//  The macro below implements the equivalent of a 3-D array of 
+//  The macro below implements the equivalent of a 3-D array of
 //  20-element structures in C standard layout.
-#define CALC_INDEX(x,y,z,e) ( e + N_CELL_ENTRIES*\
-                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
+#define CALC_INDEX(x, y, z, e)                                                 \
+  (e + N_CELL_ENTRIES * ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y))
 
-#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
+#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0))
 
 // Set this value to 1 for GATHER, or 0 for SCATTER
 #if 1
@@ -48,22 +48,41 @@
 #define SCATTER
 #endif
 
-//OpenCL block size (not trivially changeable here)
+// OpenCL block size (not trivially changeable here)
 #define BLOCK_SIZE SIZE_X
 
 /*############################################################################*/
 
-typedef enum {C = 0,
-              N, S, E, W, T, B,
-              NE, NW, SE, SW,
-              NT, NB, ST, SB,
-              ET, EB, WT, WB,
-              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+typedef enum {
+  C = 0,
+  N,
+  S,
+  E,
+  W,
+  T,
+  B,
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS,
+  N_CELL_ENTRIES
+} CELL_ENTRIES;
 
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
+typedef enum {
+  OBSTACLE = 1 << 0,
+  ACCEL = 1 << 1,
+  IN_OUT_FLOW = 1 << 2
+} CELL_FLAGS;
 
 #endif /* _CONFIG_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.c
index 7a7a539232830dce79bb6664ea05eec91be8a4bb..6bc8c020cc457210124c6b21b3dc337239d222a3 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.c
@@ -10,346 +10,320 @@
 
 // includes, system
 #include <CL/cl.h>
+#include <float.h>
 #include <math.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <float.h>
 
 // includes, project
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
 #include "ocl.h"
-#include "lbm.h"
 
 /******************************************************************************/
 
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) {
-	 
-	cl_int clStatus;
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid) {
 
-	clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid);
-	CHECK_ERROR("clSetKernelArg")
+  cl_int clStatus;
 
-	clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid);
-	CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid);
+  CHECK_ERROR("clSetKernelArg")
 
-	size_t dimBlock[3] = {SIZE_X,1,1};
-	size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1};
-	clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); 
-	CHECK_ERROR("clEnqueueNDRangeKernel") 	
-	
-	clStatus = clFinish(prm->clCommandQueue);
-	CHECK_ERROR("clFinish")
+  clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  size_t dimBlock[3] = {SIZE_X, 1, 1};
+  size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1};
+  clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL,
+                                    dimGrid, dimBlock, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueNDRangeKernel")
+
+  clStatus = clFinish(prm->clCommandQueue);
+  CHECK_ERROR("clFinish")
 }
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr ) {
-	const size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
+void LBM_allocateGrid(float **ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
 
-	*ptr = (float*)malloc( size );
-	if( ! *ptr ) {
-		printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
-				size / (1024.0*1024.0) );
-		exit( 1 );
-	}
+  *ptr = (float *)malloc(size);
+  if (!*ptr) {
+    printf("LBM_allocateGrid: could not allocate %.1f MByte\n",
+           size / (1024.0 * 1024.0));
+    exit(1);
+  }
 
-	memset( *ptr, 0, size );
+  memset(*ptr, 0, size);
 
-	printf( "LBM_allocateGrid: allocated %.1f MByte\n",
-			size / (1024.0*1024.0) );
-	
-	*ptr += MARGIN;
+  printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0));
+
+  *ptr += MARGIN;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-	cl_int clStatus;
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
   size_t max_alloc_size = 0;
-	clGetDeviceInfo(prm->clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, 
+  clGetDeviceInfo(prm->clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
                   sizeof(max_alloc_size), &max_alloc_size, NULL);
   if (max_alloc_size < size) {
     fprintf(stderr, "Can't allocate buffer: max alloc size is %dMB\n",
-            (int) (max_alloc_size >> 20));
+            (int)(max_alloc_size >> 20));
     exit(-1);
   }
-	*ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
+  *ptr =
+      clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_freeGrid( float** ptr ) {
-	free( *ptr-MARGIN );
-	*ptr = NULL;
+void LBM_freeGrid(float **ptr) {
+  free(*ptr - MARGIN);
+  *ptr = NULL;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_freeGrid(cl_mem ptr) {
-	clReleaseMemObject(ptr);
-}
+void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); }
 
 /*############################################################################*/
 
-void LBM_initializeGrid( LBM_Grid grid ) {
-	SWEEP_VAR
-
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-	SRC_C( grid  ) = DFL1;
-	SRC_N( grid  ) = DFL2;
-	SRC_S( grid  ) = DFL2;
-	SRC_E( grid  ) = DFL2;
-	SRC_W( grid  ) = DFL2;
-	SRC_T( grid  ) = DFL2;
-	SRC_B( grid  ) = DFL2;
-	SRC_NE( grid ) = DFL3;
-	SRC_NW( grid ) = DFL3;
-	SRC_SE( grid ) = DFL3;
-	SRC_SW( grid ) = DFL3;
-	SRC_NT( grid ) = DFL3;
-	SRC_NB( grid ) = DFL3;
-	SRC_ST( grid ) = DFL3;
-	SRC_SB( grid ) = DFL3;
-	SRC_ET( grid ) = DFL3;
-	SRC_EB( grid ) = DFL3;
-	SRC_WT( grid ) = DFL3;
-	SRC_WB( grid ) = DFL3;
-	
-	CLEAR_ALL_FLAGS_SWEEP( grid );
-	SWEEP_END
+void LBM_initializeGrid(LBM_Grid grid) {
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  SRC_C(grid) = DFL1;
+  SRC_N(grid) = DFL2;
+  SRC_S(grid) = DFL2;
+  SRC_E(grid) = DFL2;
+  SRC_W(grid) = DFL2;
+  SRC_T(grid) = DFL2;
+  SRC_B(grid) = DFL2;
+  SRC_NE(grid) = DFL3;
+  SRC_NW(grid) = DFL3;
+  SRC_SE(grid) = DFL3;
+  SRC_SW(grid) = DFL3;
+  SRC_NT(grid) = DFL3;
+  SRC_NB(grid) = DFL3;
+  SRC_ST(grid) = DFL3;
+  SRC_SB(grid) = DFL3;
+  SRC_ET(grid) = DFL3;
+  SRC_EB(grid) = DFL3;
+  SRC_WT(grid) = DFL3;
+  SRC_WB(grid) = DFL3;
+
+  CLEAR_ALL_FLAGS_SWEEP(grid);
+  SWEEP_END
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); 
-	cl_int clStatus;
-	clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                  h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
 }
 
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-        cl_int clStatus;
-        clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                 h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) {
-	cl_mem aux = *grid1;
-	*grid1 = *grid2;
-	*grid2 = aux;
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) {
+  cl_mem aux = *grid1;
+  *grid1 = *grid2;
+  *grid2 = aux;
 }
 
 /*############################################################################*/
 
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
-	int x,  y,  z;
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) {
+  int x, y, z;
 
-	FILE* file = fopen( filename, "rb" );
+  FILE *file = fopen(filename, "rb");
 
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE );
-			}
-			fgetc( file );
-		}
-		fgetc( file );
-	}
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (fgetc(file) != '.')
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+      }
+      fgetc(file);
+    }
+    fgetc(file);
+  }
 
-	fclose( file );
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
-	int x,  y,  z;
-
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-						y == 0 || y == SIZE_Y-1 ||
-						z == 0 || z == SIZE_Z-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-				}
-				else {
-					if( (z == 1 || z == SIZE_Z-2) &&
-							x > 1 && x < SIZE_X-2 &&
-							y > 1 && y < SIZE_Y-2 ) {
-						SET_FLAG( grid, x, y, z, ACCEL );
-					}
-				}
-			}
-		}
-	}
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) {
+  int x, y, z;
+
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 ||
+            z == SIZE_Z - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+        } else {
+          if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 &&
+              y < SIZE_Y - 2) {
+            SET_FLAG(grid, x, y, z, ACCEL);
+          }
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_showGridStatistics( LBM_Grid grid ) {
-	int nObstacleCells = 0,
-	    nAccelCells    = 0,
-	    nFluidCells    = 0;
-	float ux, uy, uz;
-	float minU2  = 1e+30, maxU2  = -1e+30, u2;
-	float minRho = 1e+30, maxRho = -1e+30, rho;
-	float mass = 0;
-
-	SWEEP_VAR
-
-		SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		rho = LOCAL( grid, C  ) + LOCAL( grid, N  )
-		+ LOCAL( grid, S  ) + LOCAL( grid, E  )
-		+ LOCAL( grid, W  ) + LOCAL( grid, T  )
-		+ LOCAL( grid, B  ) + LOCAL( grid, NE )
-		+ LOCAL( grid, NW ) + LOCAL( grid, SE )
-		+ LOCAL( grid, SW ) + LOCAL( grid, NT )
-		+ LOCAL( grid, NB ) + LOCAL( grid, ST )
-		+ LOCAL( grid, SB ) + LOCAL( grid, ET )
-		+ LOCAL( grid, EB ) + LOCAL( grid, WT )
-		+ LOCAL( grid, WB );
-
-	if( rho < minRho ) minRho = rho;
-	if( rho > maxRho ) maxRho = rho;
-	mass += rho;
-
-	if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
-		nObstacleCells++;
-	}
-	else {
-		if( TEST_FLAG_SWEEP( grid, ACCEL ))
-			nAccelCells++;
-		else
-			nFluidCells++;
-
-		ux = + LOCAL( grid, E  ) - LOCAL( grid, W  )
-			+ LOCAL( grid, NE ) - LOCAL( grid, NW )
-			+ LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, ET ) + LOCAL( grid, EB )
-			- LOCAL( grid, WT ) - LOCAL( grid, WB );
-		uy = + LOCAL( grid, N  ) - LOCAL( grid, S  )
-			+ LOCAL( grid, NE ) + LOCAL( grid, NW )
-			- LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, NT ) + LOCAL( grid, NB )
-			- LOCAL( grid, ST ) - LOCAL( grid, SB );
-		uz = + LOCAL( grid, T  ) - LOCAL( grid, B  )
-			+ LOCAL( grid, NT ) - LOCAL( grid, NB )
-			+ LOCAL( grid, ST ) - LOCAL( grid, SB )
-			+ LOCAL( grid, ET ) - LOCAL( grid, EB )
-			+ LOCAL( grid, WT ) - LOCAL( grid, WB );
-		u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);
-		if( u2 < minU2 ) minU2 = u2;
-		if( u2 > maxU2 ) maxU2 = u2;
-	}
-	SWEEP_END
-
-		printf( "LBM_showGridStatistics:\n"
-				"\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
-				"\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
-				"\tminU: %e maxU: %e\n\n",
-				nObstacleCells, nAccelCells, nFluidCells,
-				minRho, maxRho, mass,
-				sqrt( minU2 ), sqrt( maxU2 ) );
-
+void LBM_showGridStatistics(LBM_Grid grid) {
+  int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0;
+  float ux, uy, uz;
+  float minU2 = 1e+30, maxU2 = -1e+30, u2;
+  float minRho = 1e+30, maxRho = -1e+30, rho;
+  float mass = 0;
+
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) +
+        LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) +
+        LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) +
+        LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) +
+        LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB);
+
+  if (rho < minRho)
+    minRho = rho;
+  if (rho > maxRho)
+    maxRho = rho;
+  mass += rho;
+
+  if (TEST_FLAG_SWEEP(grid, OBSTACLE)) {
+    nObstacleCells++;
+  } else {
+    if (TEST_FLAG_SWEEP(grid, ACCEL))
+      nAccelCells++;
+    else
+      nFluidCells++;
+
+    ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) +
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) -
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) -
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) -
+         LOCAL(grid, ST) - LOCAL(grid, SB);
+    uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) +
+         LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) +
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho);
+    if (u2 < minU2)
+      minU2 = u2;
+    if (u2 > maxU2)
+      maxU2 = u2;
+  }
+  SWEEP_END
+
+  printf("LBM_showGridStatistics:\n"
+         "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
+         "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
+         "\tminU: %e maxU: %e\n\n",
+         nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass,
+         sqrt(minU2), sqrt(maxU2));
 }
 
 /*############################################################################*/
 
-static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		const char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1];
-
-		fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
-	else {                                                     /* little endian */
-		fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void storeValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    const char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    int i;
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1];
+
+    fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+  } else { /* little endian */
+    fwrite(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1];
-	}
-	else {                                                     /* little endian */
-		fread( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void loadValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    int i;
+
+    fread(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1];
+  } else { /* little endian */
+    fread(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-		const int binary ) {
-	OUTPUT_PRECISION rho, ux, uy, uz;
-
-	FILE* file = fopen( filename, (binary ? "wb" : "w") );
-
-	SWEEP_VAR
-	SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z)
-				rho = + SRC_C( grid ) + SRC_N( grid )
-					+ SRC_S( grid ) + SRC_E( grid )
-					+ SRC_W( grid ) + SRC_T( grid )
-					+ SRC_B( grid ) + SRC_NE( grid )
-					+ SRC_NW( grid ) + SRC_SE( grid )
-					+ SRC_SW( grid ) + SRC_NT( grid )
-					+ SRC_NB( grid ) + SRC_ST( grid )
-					+ SRC_SB( grid ) + SRC_ET( grid )
-					+ SRC_EB( grid ) + SRC_WT( grid )
-					+ SRC_WB( grid );
-				ux = + SRC_E( grid ) - SRC_W( grid ) 
-					+ SRC_NE( grid ) - SRC_NW( grid ) 
-					+ SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_ET( grid ) + SRC_EB( grid ) 
-					- SRC_WT( grid ) - SRC_WB( grid );
-				uy = + SRC_N( grid ) - SRC_S( grid ) 
-					+ SRC_NE( grid ) + SRC_NW( grid ) 
-					- SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_NT( grid ) + SRC_NB( grid ) 
-					- SRC_ST( grid ) - SRC_SB( grid );
-				uz = + SRC_T( grid ) - SRC_B( grid ) 
-					+ SRC_NT( grid ) - SRC_NB( grid ) 
-					+ SRC_ST( grid ) - SRC_SB( grid ) 
-					+ SRC_ET( grid ) - SRC_EB( grid ) 
-					+ SRC_WT( grid ) - SRC_WB( grid );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					/*
-					   fwrite( &ux, sizeof( ux ), 1, file );
-					   fwrite( &uy, sizeof( uy ), 1, file );
-					   fwrite( &uz, sizeof( uz ), 1, file );
-					   */
-					storeValue( file, &ux );
-					storeValue( file, &uy );
-					storeValue( file, &uz );
-				} else
-					fprintf( file, "%e %e %e\n", ux, uy, uz );
-
-	SWEEP_END;
-
-	fclose( file );
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const int binary) {
+  OUTPUT_PRECISION rho, ux, uy, uz;
+
+  FILE *file = fopen(filename, (binary ? "wb" : "w"));
+
+  SWEEP_VAR
+  SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z)
+  rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) +
+        SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) +
+        SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) +
+        SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) +
+        SRC_WB(grid);
+  ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) -
+       SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid);
+  uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) -
+       SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid);
+  uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) -
+       SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid);
+  ux /= rho;
+  uy /= rho;
+  uz /= rho;
+
+  if (binary) {
+    /*
+       fwrite( &ux, sizeof( ux ), 1, file );
+       fwrite( &uy, sizeof( uy ), 1, file );
+       fwrite( &uz, sizeof( uz ), 1, file );
+       */
+    storeValue(file, &ux);
+    storeValue(file, &uy);
+    storeValue(file, &uz);
+  } else
+    fprintf(file, "%e %e %e\n", ux, uy, uz);
+
+  SWEEP_END;
+
+  fclose(file);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.h
index 9dcf4639faf25701b015e0d3e6dcf0f9400b1745..64a617feb862bdffdcb0c6aa57b0f1b09c26debb 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm.h
@@ -13,23 +13,26 @@
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2);
+void LBM_showGridStatistics(LBM_Grid Grid);
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary);
 
 /* OpenCL *********************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr );
-void OpenCL_LBM_freeGrid( cl_mem ptr );
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid );
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr);
+void OpenCL_LBM_freeGrid(cl_mem ptr);
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid);
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid);
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm_macros.h
index 24fad43205f11da1c05cc8aa5895e7aa2688d3f4..99c50c048a14bb47bb3659b61f088db95706bb0c 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm_macros.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/lbm_macros.h
@@ -17,160 +17,181 @@
 #define TRUE (-1)
 #define FALSE (0)
 
-#define DFL1 (1.0f/ 3.0f)
-#define DFL2 (1.0f/18.0f)
-#define DFL3 (1.0f/36.0f)
+#define DFL1 (1.0f / 3.0f)
+#define DFL2 (1.0f / 18.0f)
+#define DFL3 (1.0f / 36.0f)
 
 /*############################################################################*/
 
-typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+typedef float
+    *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
+typedef LBM_Grid *LBM_GridPtr;
 
 /*############################################################################*/
 
-
-#define SWEEP_X  __temp_x__
-#define SWEEP_Y  __temp_y__
-#define SWEEP_Z  __temp_z__
+#define SWEEP_X __temp_x__
+#define SWEEP_Y __temp_y__
+#define SWEEP_Z __temp_z__
 #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-	for( __temp_z__ = z1; \
-	     __temp_z__ < z2; \
-		__temp_z__++) { \
-            for( __temp_y__ = 0; \
-                 __temp_y__ < SIZE_Y; \
-                 __temp_y__++) { \
-		for(__temp_x__ = 0; \
-	            __temp_x__ < SIZE_X; \
-                    __temp_x__++) { \
-
-#define SWEEP_END }}}
-
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) {                       \
+    for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) {                  \
+      for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) {
+
+#define SWEEP_END                                                              \
+  }                                                                            \
+  }                                                                            \
+  }
+
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)])
+
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #ifdef SCATTER
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C))
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+
+#define DST_C(g) (NEIGHBOR_C(g, C))
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* GATHER */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C))
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* GATHER */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
+#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v)
+
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.c
index ac972815b190d1f91ba9c78512fbebb503501d14..193dec15418f96d53198c1a07ab3affdee3e956e 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.c
@@ -9,16 +9,16 @@
 /*############################################################################*/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/stat.h>
-#include <parboil.h>
 
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
-#include "ocl.h"
 #include "main.h"
-#include "lbm.h"
+#include "ocl.h"
 
 /*############################################################################*/
 
@@ -27,202 +27,205 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid;
 /*############################################################################*/
 
 struct pb_TimerSet timers;
-int main( int nArgs, char* arg[] ) {
-	MAIN_Param param;
-	int t;
-
-	OpenCL_Param prm;
-
-	pb_InitializeTimerSet(&timers);
-        struct pb_Parameters* params;
-        params = pb_ReadParameters(&nArgs, arg);
-        
-
-	static LBM_GridPtr TEMP_srcGrid;
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
-	MAIN_parseCommandLine( nArgs, arg, &param, params );
-	MAIN_printInfo( &param );
-
-	OpenCL_initialize(&prm);
-	MAIN_initialize( &param, &prm );
-	
-	for( t = 1; t <= param.nTimeSteps; t++ ) {
-                pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-		OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid );
-                pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-		LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid );
-
-		if( (t & 63) == 0 ) {
-			printf( "timestep: %i\n", t );
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
+  int t;
+
+  OpenCL_Param prm;
+
+  pb_InitializeTimerSet(&timers);
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
+
+  static LBM_GridPtr TEMP_srcGrid;
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
+
+  OpenCL_initialize(&prm);
+  MAIN_initialize(&param, &prm);
+
+  for (t = 1; t <= param.nTimeSteps; t++) {
+    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+    OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid);
+
+    if ((t & 63) == 0) {
+      printf("timestep: %i\n", t);
 #if 0
 			CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
 			LBM_showGridStatistics( *TEMP_srcGrid );
 #endif
-		}
-	}
-	
-	MAIN_finalize( &param, &prm );
+    }
+  }
 
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
+  MAIN_finalize(&param, &prm);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        pb_PrintTimerSet(&timers);
-        pb_FreeParameters(params);
-	return 0;
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(params);
+  return 0;
 }
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) {
-	struct stat fileStat;
-
-	if( nArgs < 2 ) {
-		printf( "syntax: lbm <time steps>\n" );
-		exit( 1 );
-	}
-
-	param->nTimeSteps     = atoi( arg[1] );
-
-	if( params->inpFiles[0] != NULL ) {
-		param->obstacleFilename = params->inpFiles[0];
-
-		if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-			printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-					param->obstacleFilename );
-			exit( 1 );
-		}
-		if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-			printf( "MAIN_parseCommandLine:\n"
-					"\tsize of file '%s' is %i bytes\n"
-					"\texpected size is %i bytes\n",
-					param->obstacleFilename, (int) fileStat.st_size,
-					SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-			exit( 1 );
-		}
-	}
-	else param->obstacleFilename = NULL;
-
-        param->resultFilename = params->outFile;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
+
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
+
+  param->nTimeSteps = atoi(arg[1]);
+
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
+    }
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
+    }
+  } else
+    param->obstacleFilename = NULL;
+
+  param->resultFilename = params->outFile;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-	printf( "MAIN_printInfo:\n"
-			"\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-			"\tnTimeSteps     : %i\n"
-			"\tresult file    : %s\n"
-			"\taction         : %s\n"
-			"\tsimulation type: %s\n"
-			"\tobstacle file  : %s\n\n",
-			SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-			param->nTimeSteps, param->resultFilename, 
-			"store", "lid-driven cavity",
-			(param->obstacleFilename == NULL) ? "<none>" :
-			param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-	static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
-
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
-	LBM_allocateGrid( (float**) &TEMP_dstGrid );
-	LBM_initializeGrid( TEMP_srcGrid );
-	LBM_initializeGrid( TEMP_dstGrid );
-
-        pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	if( param->obstacleFilename != NULL ) {
-		LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename );
-		LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename );
-	}
-        
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-	LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
-	
-        pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//Setup DEVICE datastructures
-	OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid );
-	OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid );
-	
-	//Initialize DEVICE datastructures
-	OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid );
-	OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid );
-	
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_showGridStatistics( TEMP_srcGrid );
-
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
-	LBM_freeGrid( (float**) &TEMP_dstGrid );
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // Setup DEVICE datastructures
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
+
+  // Initialize DEVICE datastructures
+  OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
+
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-	LBM_Grid TEMP_srcGrid;
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  LBM_Grid TEMP_srcGrid;
 
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-	LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE );
+  LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
 
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
-	OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-	OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-	clReleaseProgram(prm->clProgram);
-	clReleaseKernel(prm->clKernel);
-	clReleaseCommandQueue(prm->clCommandQueue);
-	clReleaseContext(prm->clContext);
-	
+  clReleaseProgram(prm->clProgram);
+  clReleaseKernel(prm->clKernel);
+  clReleaseCommandQueue(prm->clCommandQueue);
+  clReleaseContext(prm->clContext);
 }
 
-void OpenCL_initialize(OpenCL_Param* prm)
-{
-	cl_int clStatus;
-	
-	clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL);
-	CHECK_ERROR("clGetPlatformIDs")
+void OpenCL_initialize(OpenCL_Param *prm) {
+  cl_int clStatus;
+
+  clStatus = clGetPlatformIDs(1, &(prm->clPlatform), NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  prm->clCps[0] = CL_CONTEXT_PLATFORM;
+  prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
+  prm->clCps[2] = 0;
 
-	prm->clCps[0] = CL_CONTEXT_PLATFORM;
-	prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
-	prm->clCps[2] = 0;
+  clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_GPU, 1,
+                            &(prm->clDevice), NULL);
+  CHECK_ERROR("clGetDeviceIDs")
 
-	clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL);
-	CHECK_ERROR("clGetDeviceIDs")
+  prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_GPU, NULL,
+                                           NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
 
-	prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
+  prm->clCommandQueue = clCreateCommandQueue(
+      prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
 
-	prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
+  pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
 
-  	pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
+  const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  prm->clProgram =
+      clCreateProgramWithSource(prm->clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
 
-	const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-	prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
+  char clOptions[100];
+  sprintf(clOptions, "-I src/opencl_base");
 
-	char clOptions[100];
-	sprintf(clOptions,"-I src/opencl_base");
-		
-	clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);
-	CHECK_ERROR("clBuildProgram")
+  clStatus = clBuildProgram(prm->clProgram, 1, &(prm->clDevice), clOptions,
+                            NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
 
-	prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);
-	CHECK_ERROR("clCreateKernel")
+  prm->clKernel =
+      clCreateKernel(prm->clProgram, "performStreamCollide_kernel", &clStatus);
+  CHECK_ERROR("clCreateKernel")
 
-	free((void*)clSource[0]);
+  free((void *)clSource[0]);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.h
index feee4e8768b13f0975481b1e3a5505ad3cdd018f..9d8e145c93b37488a3826e77b964c56699377d2a 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/main.h
@@ -12,19 +12,20 @@
 /*############################################################################*/
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm );
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm);
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm);
 
-void OpenCL_initialize(OpenCL_Param* prm);
+void OpenCL_initialize(OpenCL_Param *prm);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.c
index 292f55728a7b78c9448300637369fb0044fa6f4d..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.c
@@ -1,40 +1,36 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include "ocl.h"
 
-char* readFile(char* fileName)
-{
-	FILE* fp;
-	fp = fopen(fileName,"r");
+char *readFile(char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
 
-	if(fp == NULL)
-	{
-		printf("Error 1!\n");
-		return NULL;
-	}
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    return NULL;
+  }
 
-	fseek(fp,0,SEEK_END);
-	long size = ftell(fp);
-	rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-	char* buffer = malloc(sizeof(char)*(size+1));
-	if(buffer == NULL)
-	{
-		printf("Error 2!\n");
-		fclose(fp);
-		return NULL;
-	}
+  char *buffer = malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	size_t res = fread(buffer,1,size,fp);
-	if(res != size)
-	{
-		printf("Error 3!\n");
-		fclose(fp);
-		return NULL;
-	}
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	buffer[size] = 0;
-	fclose(fp);
-	return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.h
index 5a08a6bab9a95fa8c0158741363dd2a5c92a45b7..5d5d984ba698d6ac71af3e51de3e6724a79135aa 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_base/ocl.h
@@ -2,24 +2,22 @@
 #define __OCLH__
 
 typedef struct {
-	cl_platform_id clPlatform;
-	cl_context_properties clCps[3];
-	cl_device_id clDevice;
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-	cl_program clProgram;
-	cl_kernel clKernel;
+  cl_platform_id clPlatform;
+  cl_context_properties clCps[3];
+  cl_device_id clDevice;
+  cl_context clContext;
+  cl_command_queue clCommandQueue;
+  cl_program clProgram;
+  cl_kernel clKernel;
 } OpenCL_Param;
 
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-char* readFile(char*);
+char *readFile(char *);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/layout_config.h
index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/layout_config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/layout_config.h
@@ -13,31 +13,31 @@
 
 /*############################################################################*/
 
-//Unchangeable settings: volume simulation size for the given example
+// Unchangeable settings: volume simulation size for the given example
 #define SIZE_X (120)
 #define SIZE_Y (120)
 #define SIZE_Z (150)
 
-//Changeable settings
-//Padding in each dimension
+// Changeable settings
+// Padding in each dimension
 #define PADDING_X (8)
 #define PADDING_Y (0)
 #define PADDING_Z (4)
 
-//Pitch in each dimension
-#define PADDED_X (SIZE_X+PADDING_X)
-#define PADDED_Y (SIZE_Y+PADDING_Y)
-#define PADDED_Z (SIZE_Z+PADDING_Z)
+// Pitch in each dimension
+#define PADDED_X (SIZE_X + PADDING_X)
+#define PADDED_Y (SIZE_Y + PADDING_Y)
+#define PADDED_Z (SIZE_Z + PADDING_Z)
 
-#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
-#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)
+#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z)
+#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z)
 
 //  Flattening function
 //  This macro will be used to map a 3-D index and element to a value
-#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \
-                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
+#define CALC_INDEX(x, y, z, e)                                                 \
+  (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y))
 
-#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
+#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0))
 
 // Set this value to 1 for GATHER, or 0 for SCATTER
 #if 1
@@ -46,22 +46,41 @@
 #define SCATTER
 #endif
 
-//OpenCL block size (not trivially changeable here)
+// OpenCL block size (not trivially changeable here)
 #define BLOCK_SIZE SIZE_X
 
 /*############################################################################*/
 
-typedef enum {C = 0,
-              N, S, E, W, T, B,
-              NE, NW, SE, SW,
-              NT, NB, ST, SB,
-              ET, EB, WT, WB,
-              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+typedef enum {
+  C = 0,
+  N,
+  S,
+  E,
+  W,
+  T,
+  B,
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS,
+  N_CELL_ENTRIES
+} CELL_ENTRIES;
 
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
+typedef enum {
+  OBSTACLE = 1 << 0,
+  ACCEL = 1 << 1,
+  IN_OUT_FLOW = 1 << 2
+} CELL_FLAGS;
 
 #endif /* _CONFIG_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.c
index 8cae2c1c172ff66c001627cd24389edd74a44472..26d90928b500d0ae5e5630dbe20b2f57e9f202c2 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.c
@@ -10,345 +10,319 @@
 
 // includes, system
 #include <CL/cl.h>
+#include <float.h>
 #include <math.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <float.h>
 
 // includes, project
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
 #include "ocl.h"
-#include "lbm.h"
 
 /******************************************************************************/
 
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) {
-	 
-	cl_int clStatus;
-
-        size_t bytes = 100;
-	clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid);
-	CHECK_ERROR("clSetKernelArg")
-
-	clStatus = clSetKernelArg(prm->clKernel,1,sizeof(size_t),(void*)&bytes);
-	CHECK_ERROR("clSetKernelArg")
-	
-        clStatus = clSetKernelArg(prm->clKernel,2,sizeof(cl_mem),(void*)&dstGrid);
-	CHECK_ERROR("clSetKernelArg")
-	
-	clStatus = clSetKernelArg(prm->clKernel,3,sizeof(size_t),(void*)&bytes);
-	CHECK_ERROR("clSetKernelArg")
-	
-        size_t dimBlock[3] = {SIZE_X,1,1};
-	size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1};
-	clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); 
-	CHECK_ERROR("clEnqueueNDRangeKernel") 	
-	
-	clStatus = clFinish(prm->clCommandQueue);
-	CHECK_ERROR("clFinish")
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid) {
+
+  cl_int clStatus;
+
+  size_t bytes = 100;
+  clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(size_t), (void *)&bytes);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(prm->clKernel, 2, sizeof(cl_mem), (void *)&dstGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(prm->clKernel, 3, sizeof(size_t), (void *)&bytes);
+  CHECK_ERROR("clSetKernelArg")
+
+  size_t dimBlock[3] = {SIZE_X, 1, 1};
+  size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1};
+  clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL,
+                                    dimGrid, dimBlock, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueNDRangeKernel")
+
+  clStatus = clFinish(prm->clCommandQueue);
+  CHECK_ERROR("clFinish")
 }
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr ) {
-	const size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
+void LBM_allocateGrid(float **ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
 
-	*ptr = (float*)malloc( size );
-	if( ! *ptr ) {
-		printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
-				size / (1024.0*1024.0) );
-		exit( 1 );
-	}
+  *ptr = (float *)malloc(size);
+  if (!*ptr) {
+    printf("LBM_allocateGrid: could not allocate %.1f MByte\n",
+           size / (1024.0 * 1024.0));
+    exit(1);
+  }
 
-	memset( *ptr, 0, size );
+  memset(*ptr, 0, size);
 
-	printf( "LBM_allocateGrid: allocated %.1f MByte\n",
-			size / (1024.0*1024.0) );
-	
-	*ptr += MARGIN;
+  printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0));
+
+  *ptr += MARGIN;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-	cl_int clStatus;
-	*ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  *ptr =
+      clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_freeGrid( float** ptr ) {
-	free( *ptr-MARGIN );
-	*ptr = NULL;
+void LBM_freeGrid(float **ptr) {
+  free(*ptr - MARGIN);
+  *ptr = NULL;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_freeGrid(cl_mem ptr) {
-	clReleaseMemObject(ptr);
-}
+void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); }
 
 /*############################################################################*/
 
-void LBM_initializeGrid( LBM_Grid grid ) {
-	SWEEP_VAR
-
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-	SRC_C( grid  ) = DFL1;
-	SRC_N( grid  ) = DFL2;
-	SRC_S( grid  ) = DFL2;
-	SRC_E( grid  ) = DFL2;
-	SRC_W( grid  ) = DFL2;
-	SRC_T( grid  ) = DFL2;
-	SRC_B( grid  ) = DFL2;
-	SRC_NE( grid ) = DFL3;
-	SRC_NW( grid ) = DFL3;
-	SRC_SE( grid ) = DFL3;
-	SRC_SW( grid ) = DFL3;
-	SRC_NT( grid ) = DFL3;
-	SRC_NB( grid ) = DFL3;
-	SRC_ST( grid ) = DFL3;
-	SRC_SB( grid ) = DFL3;
-	SRC_ET( grid ) = DFL3;
-	SRC_EB( grid ) = DFL3;
-	SRC_WT( grid ) = DFL3;
-	SRC_WB( grid ) = DFL3;
-	
-	CLEAR_ALL_FLAGS_SWEEP( grid );
-	SWEEP_END
+void LBM_initializeGrid(LBM_Grid grid) {
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  SRC_C(grid) = DFL1;
+  SRC_N(grid) = DFL2;
+  SRC_S(grid) = DFL2;
+  SRC_E(grid) = DFL2;
+  SRC_W(grid) = DFL2;
+  SRC_T(grid) = DFL2;
+  SRC_B(grid) = DFL2;
+  SRC_NE(grid) = DFL3;
+  SRC_NW(grid) = DFL3;
+  SRC_SE(grid) = DFL3;
+  SRC_SW(grid) = DFL3;
+  SRC_NT(grid) = DFL3;
+  SRC_NB(grid) = DFL3;
+  SRC_ST(grid) = DFL3;
+  SRC_SB(grid) = DFL3;
+  SRC_ET(grid) = DFL3;
+  SRC_EB(grid) = DFL3;
+  SRC_WT(grid) = DFL3;
+  SRC_WB(grid) = DFL3;
+
+  CLEAR_ALL_FLAGS_SWEEP(grid);
+  SWEEP_END
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); 
-	cl_int clStatus;
-	clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                  h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
 }
 
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-        cl_int clStatus;
-        clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                 h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) {
-	cl_mem aux = *grid1;
-	*grid1 = *grid2;
-	*grid2 = aux;
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) {
+  cl_mem aux = *grid1;
+  *grid1 = *grid2;
+  *grid2 = aux;
 }
 
 /*############################################################################*/
 
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
-	int x,  y,  z;
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) {
+  int x, y, z;
 
-	FILE* file = fopen( filename, "rb" );
+  FILE *file = fopen(filename, "rb");
 
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE );
-			}
-			fgetc( file );
-		}
-		fgetc( file );
-	}
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (fgetc(file) != '.')
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+      }
+      fgetc(file);
+    }
+    fgetc(file);
+  }
 
-	fclose( file );
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
-	int x,  y,  z;
-
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-						y == 0 || y == SIZE_Y-1 ||
-						z == 0 || z == SIZE_Z-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-				}
-				else {
-					if( (z == 1 || z == SIZE_Z-2) &&
-							x > 1 && x < SIZE_X-2 &&
-							y > 1 && y < SIZE_Y-2 ) {
-						SET_FLAG( grid, x, y, z, ACCEL );
-					}
-				}
-			}
-		}
-	}
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) {
+  int x, y, z;
+
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 ||
+            z == SIZE_Z - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+        } else {
+          if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 &&
+              y < SIZE_Y - 2) {
+            SET_FLAG(grid, x, y, z, ACCEL);
+          }
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_showGridStatistics( LBM_Grid grid ) {
-	int nObstacleCells = 0,
-	    nAccelCells    = 0,
-	    nFluidCells    = 0;
-	float ux, uy, uz;
-	float minU2  = 1e+30, maxU2  = -1e+30, u2;
-	float minRho = 1e+30, maxRho = -1e+30, rho;
-	float mass = 0;
-
-	SWEEP_VAR
-
-		SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		rho = LOCAL( grid, C  ) + LOCAL( grid, N  )
-		+ LOCAL( grid, S  ) + LOCAL( grid, E  )
-		+ LOCAL( grid, W  ) + LOCAL( grid, T  )
-		+ LOCAL( grid, B  ) + LOCAL( grid, NE )
-		+ LOCAL( grid, NW ) + LOCAL( grid, SE )
-		+ LOCAL( grid, SW ) + LOCAL( grid, NT )
-		+ LOCAL( grid, NB ) + LOCAL( grid, ST )
-		+ LOCAL( grid, SB ) + LOCAL( grid, ET )
-		+ LOCAL( grid, EB ) + LOCAL( grid, WT )
-		+ LOCAL( grid, WB );
-
-	if( rho < minRho ) minRho = rho;
-	if( rho > maxRho ) maxRho = rho;
-	mass += rho;
-
-	if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
-		nObstacleCells++;
-	}
-	else {
-		if( TEST_FLAG_SWEEP( grid, ACCEL ))
-			nAccelCells++;
-		else
-			nFluidCells++;
-
-		ux = + LOCAL( grid, E  ) - LOCAL( grid, W  )
-			+ LOCAL( grid, NE ) - LOCAL( grid, NW )
-			+ LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, ET ) + LOCAL( grid, EB )
-			- LOCAL( grid, WT ) - LOCAL( grid, WB );
-		uy = + LOCAL( grid, N  ) - LOCAL( grid, S  )
-			+ LOCAL( grid, NE ) + LOCAL( grid, NW )
-			- LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, NT ) + LOCAL( grid, NB )
-			- LOCAL( grid, ST ) - LOCAL( grid, SB );
-		uz = + LOCAL( grid, T  ) - LOCAL( grid, B  )
-			+ LOCAL( grid, NT ) - LOCAL( grid, NB )
-			+ LOCAL( grid, ST ) - LOCAL( grid, SB )
-			+ LOCAL( grid, ET ) - LOCAL( grid, EB )
-			+ LOCAL( grid, WT ) - LOCAL( grid, WB );
-		u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);
-		if( u2 < minU2 ) minU2 = u2;
-		if( u2 > maxU2 ) maxU2 = u2;
-	}
-	SWEEP_END
-
-		printf( "LBM_showGridStatistics:\n"
-				"\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
-				"\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
-				"\tminU: %e maxU: %e\n\n",
-				nObstacleCells, nAccelCells, nFluidCells,
-				minRho, maxRho, mass,
-				sqrt( minU2 ), sqrt( maxU2 ) );
-
+void LBM_showGridStatistics(LBM_Grid grid) {
+  int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0;
+  float ux, uy, uz;
+  float minU2 = 1e+30, maxU2 = -1e+30, u2;
+  float minRho = 1e+30, maxRho = -1e+30, rho;
+  float mass = 0;
+
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) +
+        LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) +
+        LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) +
+        LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) +
+        LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB);
+
+  if (rho < minRho)
+    minRho = rho;
+  if (rho > maxRho)
+    maxRho = rho;
+  mass += rho;
+
+  if (TEST_FLAG_SWEEP(grid, OBSTACLE)) {
+    nObstacleCells++;
+  } else {
+    if (TEST_FLAG_SWEEP(grid, ACCEL))
+      nAccelCells++;
+    else
+      nFluidCells++;
+
+    ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) +
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) -
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) -
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) -
+         LOCAL(grid, ST) - LOCAL(grid, SB);
+    uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) +
+         LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) +
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho);
+    if (u2 < minU2)
+      minU2 = u2;
+    if (u2 > maxU2)
+      maxU2 = u2;
+  }
+  SWEEP_END
+
+  printf("LBM_showGridStatistics:\n"
+         "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
+         "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
+         "\tminU: %e maxU: %e\n\n",
+         nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass,
+         sqrt(minU2), sqrt(maxU2));
 }
 
 /*############################################################################*/
 
-static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		const char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1];
-
-		fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
-	else {                                                     /* little endian */
-		fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void storeValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    const char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    int i;
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1];
+
+    fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+  } else { /* little endian */
+    fwrite(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1];
-	}
-	else {                                                     /* little endian */
-		fread( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void loadValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    int i;
+
+    fread(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1];
+  } else { /* little endian */
+    fread(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-		const int binary ) {
-	OUTPUT_PRECISION rho, ux, uy, uz;
-
-	FILE* file = fopen( filename, (binary ? "wb" : "w") );
-
-	SWEEP_VAR
-	SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z)
-				rho = + SRC_C( grid ) + SRC_N( grid )
-					+ SRC_S( grid ) + SRC_E( grid )
-					+ SRC_W( grid ) + SRC_T( grid )
-					+ SRC_B( grid ) + SRC_NE( grid )
-					+ SRC_NW( grid ) + SRC_SE( grid )
-					+ SRC_SW( grid ) + SRC_NT( grid )
-					+ SRC_NB( grid ) + SRC_ST( grid )
-					+ SRC_SB( grid ) + SRC_ET( grid )
-					+ SRC_EB( grid ) + SRC_WT( grid )
-					+ SRC_WB( grid );
-				ux = + SRC_E( grid ) - SRC_W( grid ) 
-					+ SRC_NE( grid ) - SRC_NW( grid ) 
-					+ SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_ET( grid ) + SRC_EB( grid ) 
-					- SRC_WT( grid ) - SRC_WB( grid );
-				uy = + SRC_N( grid ) - SRC_S( grid ) 
-					+ SRC_NE( grid ) + SRC_NW( grid ) 
-					- SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_NT( grid ) + SRC_NB( grid ) 
-					- SRC_ST( grid ) - SRC_SB( grid );
-				uz = + SRC_T( grid ) - SRC_B( grid ) 
-					+ SRC_NT( grid ) - SRC_NB( grid ) 
-					+ SRC_ST( grid ) - SRC_SB( grid ) 
-					+ SRC_ET( grid ) - SRC_EB( grid ) 
-					+ SRC_WT( grid ) - SRC_WB( grid );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					/*
-					   fwrite( &ux, sizeof( ux ), 1, file );
-					   fwrite( &uy, sizeof( uy ), 1, file );
-					   fwrite( &uz, sizeof( uz ), 1, file );
-					   */
-					storeValue( file, &ux );
-					storeValue( file, &uy );
-					storeValue( file, &uz );
-				} else
-					fprintf( file, "%e %e %e\n", ux, uy, uz );
-
-	SWEEP_END;
-
-	fclose( file );
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const int binary) {
+  OUTPUT_PRECISION rho, ux, uy, uz;
+
+  FILE *file = fopen(filename, (binary ? "wb" : "w"));
+
+  SWEEP_VAR
+  SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z)
+  rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) +
+        SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) +
+        SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) +
+        SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) +
+        SRC_WB(grid);
+  ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) -
+       SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid);
+  uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) -
+       SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid);
+  uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) -
+       SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid);
+  ux /= rho;
+  uy /= rho;
+  uz /= rho;
+
+  if (binary) {
+    /*
+       fwrite( &ux, sizeof( ux ), 1, file );
+       fwrite( &uy, sizeof( uy ), 1, file );
+       fwrite( &uz, sizeof( uz ), 1, file );
+       */
+    storeValue(file, &ux);
+    storeValue(file, &uy);
+    storeValue(file, &uz);
+  } else
+    fprintf(file, "%e %e %e\n", ux, uy, uz);
+
+  SWEEP_END;
+
+  fclose(file);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.h
index 8070cf3030305619453064ca9fbf2a4c4a23c24b..b687e8ebad95099908d0d214243b6e290e871cf5 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm.h
@@ -13,23 +13,26 @@
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2);
+void LBM_showGridStatistics(LBM_Grid Grid);
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary);
 
 /* OpenCL *********************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr );
-void OpenCL_LBM_freeGrid( cl_mem ptr );
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid );
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr);
+void OpenCL_LBM_freeGrid(cl_mem ptr);
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid);
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid);
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm_macros.h
index 2f8ba8a09c93f68815ec5ce41d18821fa7396e40..d789964063797f77346bfb53eaad3f7ff8695ced 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm_macros.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/lbm_macros.h
@@ -17,160 +17,181 @@
 #define TRUE (-1)
 #define FALSE (0)
 
-#define DFL1 (1.0f/ 3.0f)
-#define DFL2 (1.0f/18.0f)
-#define DFL3 (1.0f/36.0f)
+#define DFL1 (1.0f / 3.0f)
+#define DFL2 (1.0f / 18.0f)
+#define DFL3 (1.0f / 36.0f)
 
 /*############################################################################*/
 
-typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+typedef float
+    *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
+typedef LBM_Grid *LBM_GridPtr;
 
 /*############################################################################*/
 
-
-#define SWEEP_X  __temp_x__
-#define SWEEP_Y  __temp_y__
-#define SWEEP_Z  __temp_z__
+#define SWEEP_X __temp_x__
+#define SWEEP_Y __temp_y__
+#define SWEEP_Z __temp_z__
 #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-	for( __temp_z__ = z1; \
-	     __temp_z__ < z2; \
-		__temp_z__++) { \
-            for( __temp_y__ = 0; \
-                 __temp_y__ < SIZE_Y; \
-                 __temp_y__++) { \
-		for(__temp_x__ = 0; \
-	            __temp_x__ < SIZE_X; \
-                    __temp_x__++) { \
-
-#define SWEEP_END }}}
-
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) {                       \
+    for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) {                  \
+      for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) {
+
+#define SWEEP_END                                                              \
+  }                                                                            \
+  }                                                                            \
+  }
+
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)])
+
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #ifdef SCATTER
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C))
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+
+#define DST_C(g) (NEIGHBOR_C(g, C))
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* GATHER */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C))
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* GATHER */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
+#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v)
+
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c
index 3bf89f4b8a03dec812196187cc2f4bcbd328de24..59aa8daf9a018348274e20653c9c92f6995a96e4 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.c
@@ -15,10 +15,10 @@
 #include <sys/stat.h>
 
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
-#include "ocl.h"
 #include "main.h"
-#include "lbm.h"
+#include "ocl.h"
 
 /*############################################################################*/
 
@@ -27,287 +27,296 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid;
 /*############################################################################*/
 
 struct pb_TimerSet timers;
-int main( int nArgs, char* arg[] ) {
-    MAIN_Param param;
-    int t;
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
+  int t;
 
-    OpenCL_Param prm;
+  OpenCL_Param prm;
 
-    struct pb_Parameters* params;
-    params = pb_ReadParameters(&nArgs, arg);
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
 
+  // Setup TEMP datastructures
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
 
-    //Setup TEMP datastructures
-    MAIN_parseCommandLine( nArgs, arg, &param, params );
-    MAIN_printInfo( &param );
+  /*MAIN_initialize( &param, &prm ); */ // This has been inlined
 
-    /*MAIN_initialize( &param, &prm ); */ // This has been inlined
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
 
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
-
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    if( param.obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename );
-    }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  if (param.obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename);
+  }
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
- 
-    OpenCL_initialize(&prm);
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    //Setup DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid );
+  OpenCL_initialize(&prm);
 
-    //Initialize DEVICE datastructures
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid );
+  // Setup DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid);
 
+  // Initialize DEVICE datastructures
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
 
-    clFinish(prm.clCommandQueue);
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  clFinish(prm.clCommandQueue);
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
 
-    for( t = 1; t <= param.nTimeSteps; t++ ) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-        OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid );
-        /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-        LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid );
+  for (t = 1; t <= param.nTimeSteps; t++) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+    OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+    LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid);
 
-        /*if( (t & 63) == 0 ) {*/
-            /*printf( "timestep: %i\n", t );*/
+    /*if( (t & 63) == 0 ) {*/
+    /*printf( "timestep: %i\n", t );*/
 #if 0
             CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
             LBM_showGridStatistics( *TEMP_srcGrid );
 #endif
-        /*}*/
-    }
-    clFinish(prm.clCommandQueue);
-    /*MAIN_finalize( &param, &prm );*/ // inlined
+    /*}*/
+  }
+  clFinish(prm.clCommandQueue);
+  /*MAIN_finalize( &param, &prm );*/ // inlined
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm.clProgram);
-    clReleaseKernel(prm.clKernel);
-    clReleaseCommandQueue(prm.clCommandQueue);
-    clReleaseContext(prm.clContext);
+  clReleaseProgram(prm.clProgram);
+  clReleaseKernel(prm.clKernel);
+  clReleaseCommandQueue(prm.clCommandQueue);
+  clReleaseContext(prm.clContext);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
-    LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
+  LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 
-    pb_FreeParameters(params);
-    return 0;
+  pb_FreeParameters(params);
+  return 0;
 }
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) {
-    struct stat fileStat;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
 
-    if( nArgs < 2 ) {
-        printf( "syntax: lbm <time steps>\n" );
-        exit( 1 );
-    }
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
 
-    param->nTimeSteps     = atoi( arg[1] );
-
-    if( params->inpFiles[0] != NULL ) {
-        param->obstacleFilename = params->inpFiles[0];
-
-        if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-            printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-                    param->obstacleFilename );
-            exit( 1 );
-        }
-        if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-            printf( "MAIN_parseCommandLine:\n"
-                    "\tsize of file '%s' is %i bytes\n"
-                    "\texpected size is %i bytes\n",
-                    param->obstacleFilename, (int) fileStat.st_size,
-                    SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-            exit( 1 );
-        }
+  param->nTimeSteps = atoi(arg[1]);
+
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
     }
-    else param->obstacleFilename = NULL;
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
+    }
+  } else
+    param->obstacleFilename = NULL;
 
-    param->resultFilename = params->outFile;
+  param->resultFilename = params->outFile;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-    printf( "MAIN_printInfo:\n"
-            "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-            "\tnTimeSteps     : %i\n"
-            "\tresult file    : %s\n"
-            "\taction         : %s\n"
-            "\tsimulation type: %s\n"
-            "\tobstacle file  : %s\n\n",
-            SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-            param->nTimeSteps, param->resultFilename,
-            "store", "lid-driven cavity",
-            (param->obstacleFilename == NULL) ? "<none>" :
-            param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    if( param->obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename );
-    }
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
+  }
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-    //Setup DEVICE datastructures
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid );
+  // Setup DEVICE datastructures
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
 
-    //Initialize DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid );
+  // Initialize DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_Grid TEMP_srcGrid;
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_Grid TEMP_srcGrid;
 
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE );
+  LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
 
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
 
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm->clProgram);
-    clReleaseKernel(prm->clKernel);
-    clReleaseCommandQueue(prm->clCommandQueue);
-    clReleaseContext(prm->clContext);
+  clReleaseProgram(prm->clProgram);
+  clReleaseKernel(prm->clKernel);
+  clReleaseCommandQueue(prm->clCommandQueue);
+  clReleaseContext(prm->clContext);
 }
 
-void OpenCL_initialize(OpenCL_Param* prm)
-{
-    cl_int clStatus;
-
-    cl_uint numPlatforms;
-    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_platform_id clPlatform[numPlatforms];
-    clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
-    CHECK_ERROR("clGetPlatformIDs")
-    prm->clPlatform = clPlatform[1];
-
-    prm->clCps[0] = CL_CONTEXT_PLATFORM;
-    prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
-    prm->clCps[2] = 0;
-
-    clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_CPU,1,&(prm->clDevice),NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    /*printf("Device id = %p\n", prm->clDevice);*/
-    /*cl_device_partition_property props[4];*/
-    /*props[0] = CL_DEVICE_PARTITION_BY_COUNTS;*/
-    /*props[1] = NUM_CORES;*/
-    /*props[1] = 8;*/
-    /*props[2] = CL_DEVICE_PARTITION_BY_COUNTS_LIST_END;*/
-    /*props[3] = 0;*/
-    /*cl_device_id subdevice_id;*/
-    /*cl_uint num_entries = 1;*/
-
-    /*cl_uint numDevices;*/
-    /*clCreateSubDevices(prm->clDevice, clCps, num_entries, &subdevice_id, &numDevices);*/
-    /*printf("Num of devices = %d\n", numDevices);*/
-    /*for(unsigned i =0 ; i< numDevices; i++)*/
-      /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/
-    /*prm->clDevice = subdevice_id;*/
-
-    /*printf("Device id = %p\n", prm->clDevice);*/
-    prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-    prm->clContext = clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
-
-    const unsigned char* clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")};
-
-    prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);
-    /*size_t binarySize = 39303;*/
-    /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice, &binarySize, &clSource[0], NULL,&clStatus);*/
-    CHECK_ERROR("clCreateProgramWithSource")
-
-    char clOptions[100];
-    sprintf(clOptions,"-I src/opencl_nvidia");
-
-    clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);
-    CHECK_ERROR("clBuildProgram")
-
-    prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);
-    CHECK_ERROR("clCreateKernel")
-
-    free((void*)clSource[0]);
-
-    /*pb_CreateAndBuildKernelFromBinary("build/opencl_cpu_baseline_default/kernel.ir", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel);*/
+void OpenCL_initialize(OpenCL_Param *prm) {
+  cl_int clStatus;
+
+  cl_uint numPlatforms;
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+  prm->clPlatform = clPlatform[1];
+
+  prm->clCps[0] = CL_CONTEXT_PLATFORM;
+  prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
+  prm->clCps[2] = 0;
+
+  clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_CPU, 1,
+                            &(prm->clDevice), NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  /*printf("Device id = %p\n", prm->clDevice);*/
+  /*cl_device_partition_property props[4];*/
+  /*props[0] = CL_DEVICE_PARTITION_BY_COUNTS;*/
+  /*props[1] = NUM_CORES;*/
+  /*props[1] = 8;*/
+  /*props[2] = CL_DEVICE_PARTITION_BY_COUNTS_LIST_END;*/
+  /*props[3] = 0;*/
+  /*cl_device_id subdevice_id;*/
+  /*cl_uint num_entries = 1;*/
+
+  /*cl_uint numDevices;*/
+  /*clCreateSubDevices(prm->clDevice, clCps, num_entries, &subdevice_id,
+   * &numDevices);*/
+  /*printf("Num of devices = %d\n", numDevices);*/
+  /*for(unsigned i =0 ; i< numDevices; i++)*/
+  /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/
+  /*prm->clDevice = subdevice_id;*/
+
+  /*printf("Device id = %p\n", prm->clDevice);*/
+  prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_CPU, NULL,
+                                           NULL, &clStatus);
+  prm->clContext =
+      clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  prm->clCommandQueue = clCreateCommandQueue(
+      prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
+
+  const unsigned char *clSource[] = {
+      readFile("src/opencl_cpu_baseline/kernel.cl")};
+
+  prm->clProgram =
+      clCreateProgramWithSource(prm->clContext, 1, clSource, NULL, &clStatus);
+  /*size_t binarySize = 39303;*/
+  /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice,
+   * &binarySize, &clSource[0], NULL,&clStatus);*/
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[100];
+  sprintf(clOptions, "-I src/opencl_nvidia");
+
+  clStatus = clBuildProgram(prm->clProgram, 1, &(prm->clDevice), clOptions,
+                            NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  prm->clKernel =
+      clCreateKernel(prm->clProgram, "performStreamCollide_kernel", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  free((void *)clSource[0]);
+
+  /*pb_CreateAndBuildKernelFromBinary("build/opencl_cpu_baseline_default/kernel.ir",
+   * "performStreamCollide_kernel", &prm->clContext, &prm->clDevice,
+   * &prm->clProgram, &prm->clKernel);*/
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.h
index 2ca41792bbd8ed8d7596d52e1ef79038935617ca..5f58edc2616cece34c4b3d0467f991d9c4bd93c9 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/main.h
@@ -12,19 +12,20 @@
 /*############################################################################*/
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm );
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm);
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm);
 
-void OpenCL_initialize(OpenCL_Param* prm);
+void OpenCL_initialize(OpenCL_Param *prm);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.c
index 78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.c
@@ -1,40 +1,36 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include "ocl.h"
 
-char* readFile(char* fileName)
-{
-	FILE* fp;
-	fp = fopen(fileName,"r");
+/* Read a whole file into a malloc'd NUL-terminated string; NULL on error. */
+char *readFile(char *fileName) {
+  FILE *fp = fopen(fileName, "r");
 
-	if(fp == NULL)
-	{
-		printf("Error 1!\n");
-		return NULL;
-	}
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    return NULL;
+  }
 
-	fseek(fp,0,SEEK_END);
-	long size = ftell(fp);
-	rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-	char* buffer = malloc(sizeof(char)*(size+1));
-	if(buffer == NULL)
-	{
-		printf("Error 2!\n");
-		fclose(fp);
-		return NULL;
-	}
+  char *buffer = size < 0 ? NULL : malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) { /* ftell failed or out of memory */
+    printf("Error 2!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	size_t res = fread(buffer,1,size,fp);
-	if(res != size)
-	{
-		printf("Error 3!\n");
-		fclose(fp);
-		return NULL;
-	}
+  if (fread(buffer, 1, size, fp) != (size_t)size) {
+    printf("Error 3!\n");
+    free(buffer); /* do not leak the buffer on a short read */
+    fclose(fp);
+    return NULL;
+  }
 
-	buffer[size] = 0;
-	fclose(fp);
-	return buffer;
-}	
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
+}
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.h
index c7a93a636ea59f77e59a61032b68ad8c15477511..d5011fdcf889fb729689b2a9bf08d76e6c828f10 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_baseline/ocl.h
@@ -2,24 +2,22 @@
 #define __OCLH__
 
 typedef struct {
-	cl_platform_id clPlatform;
-	cl_context_properties clCps[3];
-	cl_device_id clDevice;
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-	cl_program clProgram;
-	cl_kernel clKernel;
+  cl_platform_id clPlatform;
+  cl_context_properties clCps[3];
+  cl_device_id clDevice;
+  cl_context clContext;
+  cl_command_queue clCommandQueue;
+  cl_program clProgram;
+  cl_kernel clKernel;
 } OpenCL_Param;
 
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s! Errcode = %d\n",errorMessage, clStatus);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s! Errcode = %d\n", errorMessage, clStatus);               \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-char* readFile(char*);
+char *readFile(char *);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/layout_config.h
index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/layout_config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/layout_config.h
@@ -13,31 +13,31 @@
 
 /*############################################################################*/
 
-//Unchangeable settings: volume simulation size for the given example
+// Unchangeable settings: volume simulation size for the given example
 #define SIZE_X (120)
 #define SIZE_Y (120)
 #define SIZE_Z (150)
 
-//Changeable settings
-//Padding in each dimension
+// Changeable settings
+// Padding in each dimension
 #define PADDING_X (8)
 #define PADDING_Y (0)
 #define PADDING_Z (4)
 
-//Pitch in each dimension
-#define PADDED_X (SIZE_X+PADDING_X)
-#define PADDED_Y (SIZE_Y+PADDING_Y)
-#define PADDED_Z (SIZE_Z+PADDING_Z)
+// Pitch in each dimension
+#define PADDED_X (SIZE_X + PADDING_X)
+#define PADDED_Y (SIZE_Y + PADDING_Y)
+#define PADDED_Z (SIZE_Z + PADDING_Z)
 
-#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
-#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)
+#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z)
+#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z)
 
 //  Flattening function
 //  This macro will be used to map a 3-D index and element to a value
-#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \
-                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
+#define CALC_INDEX(x, y, z, e)                                                 \
+  (TOTAL_PADDED_CELLS * (e) + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y))
 
-#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
+#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0))
 
 // Set this value to 1 for GATHER, or 0 for SCATTER
 #if 1
@@ -46,22 +46,41 @@
 #define SCATTER
 #endif
 
-//OpenCL block size (not trivially changeable here)
+// OpenCL block size (not trivially changeable here)
 #define BLOCK_SIZE SIZE_X
 
 /*############################################################################*/
 
-typedef enum {C = 0,
-              N, S, E, W, T, B,
-              NE, NW, SE, SW,
-              NT, NB, ST, SB,
-              ET, EB, WT, WB,
-              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+typedef enum {
+  C = 0,
+  N,
+  S,
+  E,
+  W,
+  T,
+  B,
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS,
+  N_CELL_ENTRIES
+} CELL_ENTRIES;
 
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
+typedef enum {
+  OBSTACLE = 1 << 0,
+  ACCEL = 1 << 1,
+  IN_OUT_FLOW = 1 << 2
+} CELL_FLAGS;
 
 #endif /* _CONFIG_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.c
index aab11ee0cb215bc918cffecf23e97c9eb528b71c..14ffa4211b3763d7c1c6538e693a76be61a0b158 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.c
@@ -10,338 +10,312 @@
 
 // includes, system
 #include <CL/cl.h>
+#include <float.h>
 #include <math.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <float.h>
 
 // includes, project
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
 #include "ocl.h"
-#include "lbm.h"
 
 /******************************************************************************/
 
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) {
-	 
-	cl_int clStatus;
-
-	clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid);
-	CHECK_ERROR("clSetKernelArg")
-
-	clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid);
-	CHECK_ERROR("clSetKernelArg")
-	
-	size_t dimBlock[3] = {SIZE_X,1,1};
-	size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1};
-	clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); 
-	CHECK_ERROR("clEnqueueNDRangeKernel") 	
-	
-	clStatus = clFinish(prm->clCommandQueue);
-	CHECK_ERROR("clFinish")
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid) {
+
+  cl_int clStatus;
+
+  clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  size_t dimBlock[3] = {SIZE_X, 1, 1};
+  size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1};
+  clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL,
+                                    dimGrid, dimBlock, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueNDRangeKernel")
+
+  clStatus = clFinish(prm->clCommandQueue);
+  CHECK_ERROR("clFinish")
 }
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr ) {
-	const size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
+void LBM_allocateGrid(float **ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+
+  *ptr = (float *)malloc(size);
+  if (!*ptr) {
+    printf("LBM_allocateGrid: could not allocate %.1f MByte\n",
+           size / (1024.0 * 1024.0));
+    exit(1);
+  }
 
-	*ptr = (float*)malloc( size );
-	if( ! *ptr ) {
-		printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
-				size / (1024.0*1024.0) );
-		exit( 1 );
-	}
+  memset(*ptr, 0, size);
 
-	memset( *ptr, 0, size );
+  printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0));
 
-	printf( "LBM_allocateGrid: allocated %.1f MByte\n",
-			size / (1024.0*1024.0) );
-	
-	*ptr += MARGIN;
+  *ptr += MARGIN;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-	cl_int clStatus;
-	*ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  *ptr =
+      clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_freeGrid( float** ptr ) {
-	free( *ptr-MARGIN );
-	*ptr = NULL;
+void LBM_freeGrid(float **ptr) {
+  free(*ptr - MARGIN);
+  *ptr = NULL;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_freeGrid(cl_mem ptr) {
-	clReleaseMemObject(ptr);
-}
+void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); }
 
 /*############################################################################*/
 
-void LBM_initializeGrid( LBM_Grid grid ) {
-	SWEEP_VAR
-
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-	SRC_C( grid  ) = DFL1;
-	SRC_N( grid  ) = DFL2;
-	SRC_S( grid  ) = DFL2;
-	SRC_E( grid  ) = DFL2;
-	SRC_W( grid  ) = DFL2;
-	SRC_T( grid  ) = DFL2;
-	SRC_B( grid  ) = DFL2;
-	SRC_NE( grid ) = DFL3;
-	SRC_NW( grid ) = DFL3;
-	SRC_SE( grid ) = DFL3;
-	SRC_SW( grid ) = DFL3;
-	SRC_NT( grid ) = DFL3;
-	SRC_NB( grid ) = DFL3;
-	SRC_ST( grid ) = DFL3;
-	SRC_SB( grid ) = DFL3;
-	SRC_ET( grid ) = DFL3;
-	SRC_EB( grid ) = DFL3;
-	SRC_WT( grid ) = DFL3;
-	SRC_WB( grid ) = DFL3;
-	
-	CLEAR_ALL_FLAGS_SWEEP( grid );
-	SWEEP_END
+void LBM_initializeGrid(LBM_Grid grid) {
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  SRC_C(grid) = DFL1;
+  SRC_N(grid) = DFL2;
+  SRC_S(grid) = DFL2;
+  SRC_E(grid) = DFL2;
+  SRC_W(grid) = DFL2;
+  SRC_T(grid) = DFL2;
+  SRC_B(grid) = DFL2;
+  SRC_NE(grid) = DFL3;
+  SRC_NW(grid) = DFL3;
+  SRC_SE(grid) = DFL3;
+  SRC_SW(grid) = DFL3;
+  SRC_NT(grid) = DFL3;
+  SRC_NB(grid) = DFL3;
+  SRC_ST(grid) = DFL3;
+  SRC_SB(grid) = DFL3;
+  SRC_ET(grid) = DFL3;
+  SRC_EB(grid) = DFL3;
+  SRC_WT(grid) = DFL3;
+  SRC_WB(grid) = DFL3;
+
+  CLEAR_ALL_FLAGS_SWEEP(grid);
+  SWEEP_END
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); 
-	cl_int clStatus;
-	clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                  h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
 }
 
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-        cl_int clStatus;
-        clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                 h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) {
-	cl_mem aux = *grid1;
-	*grid1 = *grid2;
-	*grid2 = aux;
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) {
+  cl_mem aux = *grid1;
+  *grid1 = *grid2;
+  *grid2 = aux;
 }
 
 /*############################################################################*/
 
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
-	int x,  y,  z;
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) {
+  int x, y, z;
 
-	FILE* file = fopen( filename, "rb" );
+  FILE *file = fopen(filename, "rb");
 
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE );
-			}
-			fgetc( file );
-		}
-		fgetc( file );
-	}
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (fgetc(file) != '.')
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+      }
+      fgetc(file);
+    }
+    fgetc(file);
+  }
 
-	fclose( file );
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
-	int x,  y,  z;
-
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-						y == 0 || y == SIZE_Y-1 ||
-						z == 0 || z == SIZE_Z-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-				}
-				else {
-					if( (z == 1 || z == SIZE_Z-2) &&
-							x > 1 && x < SIZE_X-2 &&
-							y > 1 && y < SIZE_Y-2 ) {
-						SET_FLAG( grid, x, y, z, ACCEL );
-					}
-				}
-			}
-		}
-	}
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) {
+  int x, y, z;
+
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 ||
+            z == SIZE_Z - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+        } else {
+          if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 &&
+              y < SIZE_Y - 2) {
+            SET_FLAG(grid, x, y, z, ACCEL);
+          }
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_showGridStatistics( LBM_Grid grid ) {
-	int nObstacleCells = 0,
-	    nAccelCells    = 0,
-	    nFluidCells    = 0;
-	float ux, uy, uz;
-	float minU2  = 1e+30, maxU2  = -1e+30, u2;
-	float minRho = 1e+30, maxRho = -1e+30, rho;
-	float mass = 0;
-
-	SWEEP_VAR
-
-		SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		rho = LOCAL( grid, C  ) + LOCAL( grid, N  )
-		+ LOCAL( grid, S  ) + LOCAL( grid, E  )
-		+ LOCAL( grid, W  ) + LOCAL( grid, T  )
-		+ LOCAL( grid, B  ) + LOCAL( grid, NE )
-		+ LOCAL( grid, NW ) + LOCAL( grid, SE )
-		+ LOCAL( grid, SW ) + LOCAL( grid, NT )
-		+ LOCAL( grid, NB ) + LOCAL( grid, ST )
-		+ LOCAL( grid, SB ) + LOCAL( grid, ET )
-		+ LOCAL( grid, EB ) + LOCAL( grid, WT )
-		+ LOCAL( grid, WB );
-
-	if( rho < minRho ) minRho = rho;
-	if( rho > maxRho ) maxRho = rho;
-	mass += rho;
-
-	if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
-		nObstacleCells++;
-	}
-	else {
-		if( TEST_FLAG_SWEEP( grid, ACCEL ))
-			nAccelCells++;
-		else
-			nFluidCells++;
-
-		ux = + LOCAL( grid, E  ) - LOCAL( grid, W  )
-			+ LOCAL( grid, NE ) - LOCAL( grid, NW )
-			+ LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, ET ) + LOCAL( grid, EB )
-			- LOCAL( grid, WT ) - LOCAL( grid, WB );
-		uy = + LOCAL( grid, N  ) - LOCAL( grid, S  )
-			+ LOCAL( grid, NE ) + LOCAL( grid, NW )
-			- LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, NT ) + LOCAL( grid, NB )
-			- LOCAL( grid, ST ) - LOCAL( grid, SB );
-		uz = + LOCAL( grid, T  ) - LOCAL( grid, B  )
-			+ LOCAL( grid, NT ) - LOCAL( grid, NB )
-			+ LOCAL( grid, ST ) - LOCAL( grid, SB )
-			+ LOCAL( grid, ET ) - LOCAL( grid, EB )
-			+ LOCAL( grid, WT ) - LOCAL( grid, WB );
-		u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);
-		if( u2 < minU2 ) minU2 = u2;
-		if( u2 > maxU2 ) maxU2 = u2;
-	}
-	SWEEP_END
-
-		printf( "LBM_showGridStatistics:\n"
-				"\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
-				"\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
-				"\tminU: %e maxU: %e\n\n",
-				nObstacleCells, nAccelCells, nFluidCells,
-				minRho, maxRho, mass,
-				sqrt( minU2 ), sqrt( maxU2 ) );
-
+void LBM_showGridStatistics(LBM_Grid grid) {
+  int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0;
+  float ux, uy, uz;
+  float minU2 = 1e+30, maxU2 = -1e+30, u2;
+  float minRho = 1e+30, maxRho = -1e+30, rho;
+  float mass = 0;
+
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) +
+        LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) +
+        LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) +
+        LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) +
+        LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB);
+
+  if (rho < minRho)
+    minRho = rho;
+  if (rho > maxRho)
+    maxRho = rho;
+  mass += rho;
+
+  if (TEST_FLAG_SWEEP(grid, OBSTACLE)) {
+    nObstacleCells++;
+  } else {
+    if (TEST_FLAG_SWEEP(grid, ACCEL))
+      nAccelCells++;
+    else
+      nFluidCells++;
+
+    ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) +
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) -
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) -
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) -
+         LOCAL(grid, ST) - LOCAL(grid, SB);
+    uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) +
+         LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) +
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho);
+    if (u2 < minU2)
+      minU2 = u2;
+    if (u2 > maxU2)
+      maxU2 = u2;
+  }
+  SWEEP_END
+
+  printf("LBM_showGridStatistics:\n"
+         "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
+         "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
+         "\tminU: %e maxU: %e\n\n",
+         nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass,
+         sqrt(minU2), sqrt(maxU2));
 }
 
 /*############################################################################*/
 
-static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		const char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1];
-
-		fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
-	else {                                                     /* little endian */
-		fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+/* Writes *v to file in little-endian byte order, whatever the host order. */
+static void storeValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    const char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    size_t i; /* unsigned, to match the sizeof() bound below */
+    /* Reverse the bytes of *v into buffer before writing. */
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1];
+    fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+  } else { /* little endian */
+    fwrite(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1];
-	}
-	else {                                                     /* little endian */
-		fread( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+/* Reads one little-endian value from file into *v, in host byte order. */
+static void loadValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    size_t i; /* unsigned, to match the sizeof() bound below */
+    fread(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+    /* Reverse the bytes just read into *v. */
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1];
+  } else { /* little endian */
+    fread(v, sizeof(OUTPUT_PRECISION), 1, file); /* NOTE: result unchecked */
+  }
 }
 
 /*############################################################################*/
 
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-		const int binary ) {
-	OUTPUT_PRECISION rho, ux, uy, uz;
-
-	FILE* file = fopen( filename, (binary ? "wb" : "w") );
-
-	SWEEP_VAR
-	SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z)
-				rho = + SRC_C( grid ) + SRC_N( grid )
-					+ SRC_S( grid ) + SRC_E( grid )
-					+ SRC_W( grid ) + SRC_T( grid )
-					+ SRC_B( grid ) + SRC_NE( grid )
-					+ SRC_NW( grid ) + SRC_SE( grid )
-					+ SRC_SW( grid ) + SRC_NT( grid )
-					+ SRC_NB( grid ) + SRC_ST( grid )
-					+ SRC_SB( grid ) + SRC_ET( grid )
-					+ SRC_EB( grid ) + SRC_WT( grid )
-					+ SRC_WB( grid );
-				ux = + SRC_E( grid ) - SRC_W( grid ) 
-					+ SRC_NE( grid ) - SRC_NW( grid ) 
-					+ SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_ET( grid ) + SRC_EB( grid ) 
-					- SRC_WT( grid ) - SRC_WB( grid );
-				uy = + SRC_N( grid ) - SRC_S( grid ) 
-					+ SRC_NE( grid ) + SRC_NW( grid ) 
-					- SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_NT( grid ) + SRC_NB( grid ) 
-					- SRC_ST( grid ) - SRC_SB( grid );
-				uz = + SRC_T( grid ) - SRC_B( grid ) 
-					+ SRC_NT( grid ) - SRC_NB( grid ) 
-					+ SRC_ST( grid ) - SRC_SB( grid ) 
-					+ SRC_ET( grid ) - SRC_EB( grid ) 
-					+ SRC_WT( grid ) - SRC_WB( grid );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					/*
-					   fwrite( &ux, sizeof( ux ), 1, file );
-					   fwrite( &uy, sizeof( uy ), 1, file );
-					   fwrite( &uz, sizeof( uz ), 1, file );
-					   */
-					storeValue( file, &ux );
-					storeValue( file, &uy );
-					storeValue( file, &uz );
-				} else
-					fprintf( file, "%e %e %e\n", ux, uy, uz );
-
-	SWEEP_END;
-
-	fclose( file );
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const int binary) {
+  OUTPUT_PRECISION rho, ux, uy, uz;
+
+  FILE *file = fopen(filename, (binary ? "wb" : "w"));
+
+  SWEEP_VAR
+  SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z)
+  rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) +
+        SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) +
+        SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) +
+        SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) +
+        SRC_WB(grid);
+  ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) -
+       SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid);
+  uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) -
+       SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid);
+  uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) -
+       SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid);
+  ux /= rho;
+  uy /= rho;
+  uz /= rho;
+
+  if (binary) {
+    /*
+       fwrite( &ux, sizeof( ux ), 1, file );
+       fwrite( &uy, sizeof( uy ), 1, file );
+       fwrite( &uz, sizeof( uz ), 1, file );
+       */
+    storeValue(file, &ux);
+    storeValue(file, &uy);
+    storeValue(file, &uz);
+  } else
+    fprintf(file, "%e %e %e\n", ux, uy, uz);
+
+  SWEEP_END;
+
+  fclose(file);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.h
index 8070cf3030305619453064ca9fbf2a4c4a23c24b..b687e8ebad95099908d0d214243b6e290e871cf5 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm.h
@@ -13,23 +13,26 @@
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2);
+void LBM_showGridStatistics(LBM_Grid Grid);
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary);
 
 /* OpenCL *********************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr );
-void OpenCL_LBM_freeGrid( cl_mem ptr );
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid );
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr);
+void OpenCL_LBM_freeGrid(cl_mem ptr);
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid);
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid);
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm_macros.h
index 2f8ba8a09c93f68815ec5ce41d18821fa7396e40..d789964063797f77346bfb53eaad3f7ff8695ced 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm_macros.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/lbm_macros.h
@@ -17,160 +17,181 @@
 #define TRUE (-1)
 #define FALSE (0)
 
-#define DFL1 (1.0f/ 3.0f)
-#define DFL2 (1.0f/18.0f)
-#define DFL3 (1.0f/36.0f)
+#define DFL1 (1.0f / 3.0f)
+#define DFL2 (1.0f / 18.0f)
+#define DFL3 (1.0f / 36.0f)
 
 /*############################################################################*/
 
-typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+typedef float
+    *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
+typedef LBM_Grid *LBM_GridPtr;
 
 /*############################################################################*/
 
-
-#define SWEEP_X  __temp_x__
-#define SWEEP_Y  __temp_y__
-#define SWEEP_Z  __temp_z__
+#define SWEEP_X __temp_x__
+#define SWEEP_Y __temp_y__
+#define SWEEP_Z __temp_z__
 #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-	for( __temp_z__ = z1; \
-	     __temp_z__ < z2; \
-		__temp_z__++) { \
-            for( __temp_y__ = 0; \
-                 __temp_y__ < SIZE_Y; \
-                 __temp_y__++) { \
-		for(__temp_x__ = 0; \
-	            __temp_x__ < SIZE_X; \
-                    __temp_x__++) { \
-
-#define SWEEP_END }}}
-
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) {                       \
+    for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) {                  \
+      for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) {
+
+#define SWEEP_END                                                              \
+  }                                                                            \
+  }                                                                            \
+  }
+
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)])
+
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #ifdef SCATTER
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C))
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+
+#define DST_C(g) (NEIGHBOR_C(g, C))
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* GATHER */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C))
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* GATHER */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
+#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v)
+
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c
index 168cc12fd63245b6a04d3f8468d7b3fe463187db..d93a919df300c520c7105612cc54f9684f052678 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.c
@@ -15,10 +15,10 @@
 #include <sys/stat.h>
 
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
-#include "ocl.h"
 #include "main.h"
-#include "lbm.h"
+#include "ocl.h"
 
 /*############################################################################*/
 
@@ -27,286 +27,294 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid;
 /*############################################################################*/
 
 struct pb_TimerSet timers;
-int main( int nArgs, char* arg[] ) {
-    MAIN_Param param;
-    int t;
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
+  int t;
 
-    OpenCL_Param prm;
+  OpenCL_Param prm;
 
-    struct pb_Parameters* params;
-    params = pb_ReadParameters(&nArgs, arg);
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
 
+  // Setup TEMP datastructures
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
 
-    //Setup TEMP datastructures
-    MAIN_parseCommandLine( nArgs, arg, &param, params );
-    MAIN_printInfo( &param );
+  /*MAIN_initialize( &param, &prm ); */ // This has been inlined
 
-    /*MAIN_initialize( &param, &prm ); */ // This has been inlined
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
 
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
-
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    if( param.obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename );
-    }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  if (param.obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename);
+  }
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
- 
-    OpenCL_initialize(&prm);
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
 
-    //Setup DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid );
+  OpenCL_initialize(&prm);
 
-    //Initialize DEVICE datastructures
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid );
+  // Setup DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid);
 
+  // Initialize DEVICE datastructures
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
 
-    for( t = 1; t <= param.nTimeSteps; t++ ) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-        OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid );
-        /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-        LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid );
+  for (t = 1; t <= param.nTimeSteps; t++) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+    OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+    LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid);
 
-        /*if( (t & 63) == 0 ) {*/
-            /*printf( "timestep: %i\n", t );*/
+    /*if( (t & 63) == 0 ) {*/
+    /*printf( "timestep: %i\n", t );*/
 #if 0
             CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
             LBM_showGridStatistics( *TEMP_srcGrid );
 #endif
-        /*}*/
-    }
+    /*}*/
+  }
 
-    /*MAIN_finalize( &param, &prm );*/ // inlined
+  /*MAIN_finalize( &param, &prm );*/ // inlined
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm.clProgram);
-    clReleaseKernel(prm.clKernel);
-    clReleaseCommandQueue(prm.clCommandQueue);
-    clReleaseContext(prm.clContext);
+  clReleaseProgram(prm.clProgram);
+  clReleaseKernel(prm.clKernel);
+  clReleaseCommandQueue(prm.clCommandQueue);
+  clReleaseContext(prm.clContext);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
-    LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
+  LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 
-    pb_FreeParameters(params);
-    return 0;
+  pb_FreeParameters(params);
+  return 0;
 }
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) {
-    struct stat fileStat;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
 
-    if( nArgs < 2 ) {
-        printf( "syntax: lbm <time steps>\n" );
-        exit( 1 );
-    }
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
+
+  param->nTimeSteps = atoi(arg[1]);
 
-    param->nTimeSteps     = atoi( arg[1] );
-
-    if( params->inpFiles[0] != NULL ) {
-        param->obstacleFilename = params->inpFiles[0];
-
-        if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-            printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-                    param->obstacleFilename );
-            exit( 1 );
-        }
-        if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-            printf( "MAIN_parseCommandLine:\n"
-                    "\tsize of file '%s' is %i bytes\n"
-                    "\texpected size is %i bytes\n",
-                    param->obstacleFilename, (int) fileStat.st_size,
-                    SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-            exit( 1 );
-        }
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
+    }
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
     }
-    else param->obstacleFilename = NULL;
+  } else
+    param->obstacleFilename = NULL;
 
-    param->resultFilename = params->outFile;
+  param->resultFilename = params->outFile;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-    printf( "MAIN_printInfo:\n"
-            "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-            "\tnTimeSteps     : %i\n"
-            "\tresult file    : %s\n"
-            "\taction         : %s\n"
-            "\tsimulation type: %s\n"
-            "\tobstacle file  : %s\n\n",
-            SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-            param->nTimeSteps, param->resultFilename,
-            "store", "lid-driven cavity",
-            (param->obstacleFilename == NULL) ? "<none>" :
-            param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    if( param->obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename );
-    }
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    //Setup DEVICE datastructures
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid );
-
-    //Initialize DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
-
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  // Setup DEVICE datastructures
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
+
+  // Initialize DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
+
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_Grid TEMP_srcGrid;
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_Grid TEMP_srcGrid;
 
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE );
+  LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
 
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm->clProgram);
-    clReleaseKernel(prm->clKernel);
-    clReleaseCommandQueue(prm->clCommandQueue);
-    clReleaseContext(prm->clContext);
+  clReleaseProgram(prm->clProgram);
+  clReleaseKernel(prm->clKernel);
+  clReleaseCommandQueue(prm->clCommandQueue);
+  clReleaseContext(prm->clContext);
 }
 
-void OpenCL_initialize(OpenCL_Param* prm)
-{
-    cl_int clStatus;
-
-    cl_uint numPlatforms;
-    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_platform_id clPlatform[numPlatforms];
-    clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
-    CHECK_ERROR("clGetPlatformIDs")
-    prm->clPlatform = clPlatform[1];
-
-    prm->clCps[0] = CL_CONTEXT_PLATFORM;
-    prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
-    prm->clCps[2] = 0;
-
-    clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_CPU,1,&(prm->clDevice),NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    /*printf("Device id = %p\n", prm->clDevice);*/
-    /*cl_device_partition_property props[3];*/
-    /*props[0] = CL_DEVICE_PARTITION_EQUALLY;*/
-    /*props[1] = 1;*/
-    /*props[2] = 0;*/
-    /*cl_device_id subdevice_id[8];*/
-    /*cl_uint num_entries = 8;*/
-
-    /*cl_uint numDevices;*/
-    /*clCreateSubDevices(prm->clDevice, props, num_entries, subdevice_id, &numDevices);*/
-    /*printf("Num of devices = %d\n", numDevices);*/
-    /*for(unsigned i =0 ; i< numDevices; i++)*/
-      /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/
-    /*prm->clDevice = subdevice_id[0];*/
-
-    /*printf("Device id = %p\n", prm->clDevice);*/
-    /*prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);*/
-    prm->clContext = clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
-
-    /*const unsigned char* clSource[] = {readFile("kernel.ir")};*/
-
-    /*prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);*/
-    /*size_t binarySize = 39303;*/
-    /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice, &binarySize, &clSource[0], NULL,&clStatus);*/
-    /*CHECK_ERROR("clCreateProgramWithSource")*/
-
-    /*char clOptions[100];*/
-    /*sprintf(clOptions,"-I src/opencl_nvidia");*/
-
-    /*clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);*/
-    /*CHECK_ERROR("clBuildProgram")*/
-
-    /*prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);*/
-    /*CHECK_ERROR("clCreateKernel")*/
-
-    /*free((void*)clSource[0]);*/
-
-    pb_CreateAndBuildKernelFromBinary("kernel.ir", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel);
+void OpenCL_initialize(OpenCL_Param *prm) {
+  cl_int clStatus;
+
+  cl_uint numPlatforms;
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+  prm->clPlatform = clPlatform[1];
+
+  prm->clCps[0] = CL_CONTEXT_PLATFORM;
+  prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
+  prm->clCps[2] = 0;
+
+  clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_CPU, 1,
+                            &(prm->clDevice), NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  /*printf("Device id = %p\n", prm->clDevice);*/
+  /*cl_device_partition_property props[3];*/
+  /*props[0] = CL_DEVICE_PARTITION_EQUALLY;*/
+  /*props[1] = 1;*/
+  /*props[2] = 0;*/
+  /*cl_device_id subdevice_id[8];*/
+  /*cl_uint num_entries = 8;*/
+
+  /*cl_uint numDevices;*/
+  /*clCreateSubDevices(prm->clDevice, props, num_entries, subdevice_id,
+   * &numDevices);*/
+  /*printf("Num of devices = %d\n", numDevices);*/
+  /*for(unsigned i =0 ; i< numDevices; i++)*/
+  /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/
+  /*prm->clDevice = subdevice_id[0];*/
+
+  /*printf("Device id = %p\n", prm->clDevice);*/
+  /*prm->clContext =
+   * clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);*/
+  prm->clContext =
+      clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  prm->clCommandQueue = clCreateCommandQueue(
+      prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
+
+  /*const unsigned char* clSource[] = {readFile("kernel.ir")};*/
+
+  /*prm->clProgram =
+   * clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);*/
+  /*size_t binarySize = 39303;*/
+  /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice,
+   * &binarySize, &clSource[0], NULL,&clStatus);*/
+  /*CHECK_ERROR("clCreateProgramWithSource")*/
+
+  /*char clOptions[100];*/
+  /*sprintf(clOptions,"-I src/opencl_nvidia");*/
+
+  /*clStatus =
+   * clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);*/
+  /*CHECK_ERROR("clBuildProgram")*/
+
+  /*prm->clKernel =
+   * clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);*/
+  /*CHECK_ERROR("clCreateKernel")*/
+
+  /*free((void*)clSource[0]);*/
+
+  pb_CreateAndBuildKernelFromBinary("kernel.ir", "performStreamCollide_kernel",
+                                    &prm->clContext, &prm->clDevice,
+                                    &prm->clProgram, &prm->clKernel);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.h
index 2ca41792bbd8ed8d7596d52e1ef79038935617ca..5f58edc2616cece34c4b3d0467f991d9c4bd93c9 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/main.h
@@ -12,19 +12,20 @@
 /*############################################################################*/
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm );
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm);
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm);
 
-void OpenCL_initialize(OpenCL_Param* prm);
+void OpenCL_initialize(OpenCL_Param *prm);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.c
index 78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.c
@@ -1,40 +1,36 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include "ocl.h"
 
-char* readFile(char* fileName)
-{
-	FILE* fp;
-	fp = fopen(fileName,"r");
+char *readFile(char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
 
-	if(fp == NULL)
-	{
-		printf("Error 1!\n");
-		return NULL;
-	}
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    return NULL;
+  }
 
-	fseek(fp,0,SEEK_END);
-	long size = ftell(fp);
-	rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-	char* buffer = malloc(sizeof(char)*(size+1));
-	if(buffer == NULL)
-	{
-		printf("Error 2!\n");
-		fclose(fp);
-		return NULL;
-	}
+  char *buffer = malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	size_t res = fread(buffer,1,size,fp);
-	if(res != size)
-	{
-		printf("Error 3!\n");
-		fclose(fp);
-		return NULL;
-	}
+  if (fread(buffer, 1, size, fp) != (size_t)size) { /* short read */
+    printf("Error 3!\n");
+    free(buffer); /* release the allocation; it was leaked on this path */
+    fclose(fp);
+    return NULL;
+  }
 
-	buffer[size] = 0;
-	fclose(fp);
-	return buffer;
-}	
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
+}
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.h
index c7a93a636ea59f77e59a61032b68ad8c15477511..d5011fdcf889fb729689b2a9bf08d76e6c828f10 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_long/ocl.h
@@ -2,24 +2,22 @@
 #define __OCLH__
 
 typedef struct {
-	cl_platform_id clPlatform;
-	cl_context_properties clCps[3];
-	cl_device_id clDevice;
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-	cl_program clProgram;
-	cl_kernel clKernel;
+  cl_platform_id clPlatform;
+  cl_context_properties clCps[3];
+  cl_device_id clDevice;
+  cl_context clContext;
+  cl_command_queue clCommandQueue;
+  cl_program clProgram;
+  cl_kernel clKernel;
 } OpenCL_Param;
 
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s! Errcode = %d\n",errorMessage, clStatus);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s! Errcode = %d\n", errorMessage, clStatus);               \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-char* readFile(char*);
+char *readFile(char *);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/layout_config.h
index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/layout_config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/layout_config.h
@@ -13,31 +13,31 @@
 
 /*############################################################################*/
 
-//Unchangeable settings: volume simulation size for the given example
+// Unchangeable settings: volume simulation size for the given example
 #define SIZE_X (120)
 #define SIZE_Y (120)
 #define SIZE_Z (150)
 
-//Changeable settings
-//Padding in each dimension
+// Changeable settings
+// Padding in each dimension
 #define PADDING_X (8)
 #define PADDING_Y (0)
 #define PADDING_Z (4)
 
-//Pitch in each dimension
-#define PADDED_X (SIZE_X+PADDING_X)
-#define PADDED_Y (SIZE_Y+PADDING_Y)
-#define PADDED_Z (SIZE_Z+PADDING_Z)
+// Pitch in each dimension
+#define PADDED_X (SIZE_X + PADDING_X)
+#define PADDED_Y (SIZE_Y + PADDING_Y)
+#define PADDED_Z (SIZE_Z + PADDING_Z)
 
-#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
-#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)
+#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z)
+#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z)
 
 //  Flattening function
 //  This macro will be used to map a 3-D index and element to a value
-#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \
-                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
+#define CALC_INDEX(x, y, z, e)                                                 \
+  (TOTAL_PADDED_CELLS * (e) + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y))
 
-#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
+#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0))
 
 // Set this value to 1 for GATHER, or 0 for SCATTER
 #if 1
@@ -46,22 +46,41 @@
 #define SCATTER
 #endif
 
-//OpenCL block size (not trivially changeable here)
+// OpenCL block size (not trivially changeable here)
 #define BLOCK_SIZE SIZE_X
 
 /*############################################################################*/
 
-typedef enum {C = 0,
-              N, S, E, W, T, B,
-              NE, NW, SE, SW,
-              NT, NB, ST, SB,
-              ET, EB, WT, WB,
-              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+typedef enum {
+  C = 0,
+  N,
+  S,
+  E,
+  W,
+  T,
+  B,
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS,
+  N_CELL_ENTRIES
+} CELL_ENTRIES;
 
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
+typedef enum {
+  OBSTACLE = 1 << 0,
+  ACCEL = 1 << 1,
+  IN_OUT_FLOW = 1 << 2
+} CELL_FLAGS;
 
 #endif /* _CONFIG_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.c
index aab11ee0cb215bc918cffecf23e97c9eb528b71c..14ffa4211b3763d7c1c6538e693a76be61a0b158 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.c
@@ -10,338 +10,312 @@
 
 // includes, system
 #include <CL/cl.h>
+#include <float.h>
 #include <math.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <float.h>
 
 // includes, project
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
 #include "ocl.h"
-#include "lbm.h"
 
 /******************************************************************************/
 
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) {
-	 
-	cl_int clStatus;
-
-	clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid);
-	CHECK_ERROR("clSetKernelArg")
-
-	clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid);
-	CHECK_ERROR("clSetKernelArg")
-	
-	size_t dimBlock[3] = {SIZE_X,1,1};
-	size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1};
-	clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); 
-	CHECK_ERROR("clEnqueueNDRangeKernel") 	
-	
-	clStatus = clFinish(prm->clCommandQueue);
-	CHECK_ERROR("clFinish")
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid) {
+
+  cl_int clStatus;
+
+  clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  size_t dimBlock[3] = {SIZE_X, 1, 1};
+  size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1};
+  clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL,
+                                    dimGrid, dimBlock, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueNDRangeKernel")
+
+  clStatus = clFinish(prm->clCommandQueue);
+  CHECK_ERROR("clFinish")
 }
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr ) {
-	const size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
+void LBM_allocateGrid(float **ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+
+  *ptr = (float *)malloc(size);
+  if (!*ptr) {
+    printf("LBM_allocateGrid: could not allocate %.1f MByte\n",
+           size / (1024.0 * 1024.0));
+    exit(1);
+  }
 
-	*ptr = (float*)malloc( size );
-	if( ! *ptr ) {
-		printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
-				size / (1024.0*1024.0) );
-		exit( 1 );
-	}
+  memset(*ptr, 0, size);
 
-	memset( *ptr, 0, size );
+  printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0));
 
-	printf( "LBM_allocateGrid: allocated %.1f MByte\n",
-			size / (1024.0*1024.0) );
-	
-	*ptr += MARGIN;
+  *ptr += MARGIN;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-	cl_int clStatus;
-	*ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  *ptr =
+      clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_freeGrid( float** ptr ) {
-	free( *ptr-MARGIN );
-	*ptr = NULL;
+void LBM_freeGrid(float **ptr) {
+  free(*ptr - MARGIN);
+  *ptr = NULL;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_freeGrid(cl_mem ptr) {
-	clReleaseMemObject(ptr);
-}
+void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); }
 
 /*############################################################################*/
 
-void LBM_initializeGrid( LBM_Grid grid ) {
-	SWEEP_VAR
-
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-	SRC_C( grid  ) = DFL1;
-	SRC_N( grid  ) = DFL2;
-	SRC_S( grid  ) = DFL2;
-	SRC_E( grid  ) = DFL2;
-	SRC_W( grid  ) = DFL2;
-	SRC_T( grid  ) = DFL2;
-	SRC_B( grid  ) = DFL2;
-	SRC_NE( grid ) = DFL3;
-	SRC_NW( grid ) = DFL3;
-	SRC_SE( grid ) = DFL3;
-	SRC_SW( grid ) = DFL3;
-	SRC_NT( grid ) = DFL3;
-	SRC_NB( grid ) = DFL3;
-	SRC_ST( grid ) = DFL3;
-	SRC_SB( grid ) = DFL3;
-	SRC_ET( grid ) = DFL3;
-	SRC_EB( grid ) = DFL3;
-	SRC_WT( grid ) = DFL3;
-	SRC_WB( grid ) = DFL3;
-	
-	CLEAR_ALL_FLAGS_SWEEP( grid );
-	SWEEP_END
+void LBM_initializeGrid(LBM_Grid grid) {
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  SRC_C(grid) = DFL1;
+  SRC_N(grid) = DFL2;
+  SRC_S(grid) = DFL2;
+  SRC_E(grid) = DFL2;
+  SRC_W(grid) = DFL2;
+  SRC_T(grid) = DFL2;
+  SRC_B(grid) = DFL2;
+  SRC_NE(grid) = DFL3;
+  SRC_NW(grid) = DFL3;
+  SRC_SE(grid) = DFL3;
+  SRC_SW(grid) = DFL3;
+  SRC_NT(grid) = DFL3;
+  SRC_NB(grid) = DFL3;
+  SRC_ST(grid) = DFL3;
+  SRC_SB(grid) = DFL3;
+  SRC_ET(grid) = DFL3;
+  SRC_EB(grid) = DFL3;
+  SRC_WT(grid) = DFL3;
+  SRC_WB(grid) = DFL3;
+
+  CLEAR_ALL_FLAGS_SWEEP(grid);
+  SWEEP_END
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); 
-	cl_int clStatus;
-	clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                  h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
 }
 
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-        cl_int clStatus;
-        clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                 h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) {
-	cl_mem aux = *grid1;
-	*grid1 = *grid2;
-	*grid2 = aux;
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) {
+  cl_mem aux = *grid1;
+  *grid1 = *grid2;
+  *grid2 = aux;
 }
 
 /*############################################################################*/
 
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
-	int x,  y,  z;
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) {
+  int x, y, z;
 
-	FILE* file = fopen( filename, "rb" );
+  FILE *file = fopen(filename, "rb");
 
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE );
-			}
-			fgetc( file );
-		}
-		fgetc( file );
-	}
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (fgetc(file) != '.')
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+      }
+      fgetc(file);
+    }
+    fgetc(file);
+  }
 
-	fclose( file );
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
-	int x,  y,  z;
-
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-						y == 0 || y == SIZE_Y-1 ||
-						z == 0 || z == SIZE_Z-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-				}
-				else {
-					if( (z == 1 || z == SIZE_Z-2) &&
-							x > 1 && x < SIZE_X-2 &&
-							y > 1 && y < SIZE_Y-2 ) {
-						SET_FLAG( grid, x, y, z, ACCEL );
-					}
-				}
-			}
-		}
-	}
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) {
+  int x, y, z;
+
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 ||
+            z == SIZE_Z - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+        } else {
+          if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 &&
+              y < SIZE_Y - 2) {
+            SET_FLAG(grid, x, y, z, ACCEL);
+          }
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_showGridStatistics( LBM_Grid grid ) {
-	int nObstacleCells = 0,
-	    nAccelCells    = 0,
-	    nFluidCells    = 0;
-	float ux, uy, uz;
-	float minU2  = 1e+30, maxU2  = -1e+30, u2;
-	float minRho = 1e+30, maxRho = -1e+30, rho;
-	float mass = 0;
-
-	SWEEP_VAR
-
-		SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		rho = LOCAL( grid, C  ) + LOCAL( grid, N  )
-		+ LOCAL( grid, S  ) + LOCAL( grid, E  )
-		+ LOCAL( grid, W  ) + LOCAL( grid, T  )
-		+ LOCAL( grid, B  ) + LOCAL( grid, NE )
-		+ LOCAL( grid, NW ) + LOCAL( grid, SE )
-		+ LOCAL( grid, SW ) + LOCAL( grid, NT )
-		+ LOCAL( grid, NB ) + LOCAL( grid, ST )
-		+ LOCAL( grid, SB ) + LOCAL( grid, ET )
-		+ LOCAL( grid, EB ) + LOCAL( grid, WT )
-		+ LOCAL( grid, WB );
-
-	if( rho < minRho ) minRho = rho;
-	if( rho > maxRho ) maxRho = rho;
-	mass += rho;
-
-	if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
-		nObstacleCells++;
-	}
-	else {
-		if( TEST_FLAG_SWEEP( grid, ACCEL ))
-			nAccelCells++;
-		else
-			nFluidCells++;
-
-		ux = + LOCAL( grid, E  ) - LOCAL( grid, W  )
-			+ LOCAL( grid, NE ) - LOCAL( grid, NW )
-			+ LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, ET ) + LOCAL( grid, EB )
-			- LOCAL( grid, WT ) - LOCAL( grid, WB );
-		uy = + LOCAL( grid, N  ) - LOCAL( grid, S  )
-			+ LOCAL( grid, NE ) + LOCAL( grid, NW )
-			- LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, NT ) + LOCAL( grid, NB )
-			- LOCAL( grid, ST ) - LOCAL( grid, SB );
-		uz = + LOCAL( grid, T  ) - LOCAL( grid, B  )
-			+ LOCAL( grid, NT ) - LOCAL( grid, NB )
-			+ LOCAL( grid, ST ) - LOCAL( grid, SB )
-			+ LOCAL( grid, ET ) - LOCAL( grid, EB )
-			+ LOCAL( grid, WT ) - LOCAL( grid, WB );
-		u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);
-		if( u2 < minU2 ) minU2 = u2;
-		if( u2 > maxU2 ) maxU2 = u2;
-	}
-	SWEEP_END
-
-		printf( "LBM_showGridStatistics:\n"
-				"\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
-				"\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
-				"\tminU: %e maxU: %e\n\n",
-				nObstacleCells, nAccelCells, nFluidCells,
-				minRho, maxRho, mass,
-				sqrt( minU2 ), sqrt( maxU2 ) );
-
+void LBM_showGridStatistics(LBM_Grid grid) {
+  int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0;
+  float ux, uy, uz;
+  float minU2 = 1e+30, maxU2 = -1e+30, u2;
+  float minRho = 1e+30, maxRho = -1e+30, rho;
+  float mass = 0;
+
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) +
+        LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) +
+        LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) +
+        LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) +
+        LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB);
+
+  if (rho < minRho)
+    minRho = rho;
+  if (rho > maxRho)
+    maxRho = rho;
+  mass += rho;
+
+  if (TEST_FLAG_SWEEP(grid, OBSTACLE)) {
+    nObstacleCells++;
+  } else {
+    if (TEST_FLAG_SWEEP(grid, ACCEL))
+      nAccelCells++;
+    else
+      nFluidCells++;
+
+    ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) +
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) -
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) -
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) -
+         LOCAL(grid, ST) - LOCAL(grid, SB);
+    uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) +
+         LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) +
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho);
+    if (u2 < minU2)
+      minU2 = u2;
+    if (u2 > maxU2)
+      maxU2 = u2;
+  }
+  SWEEP_END
+
+  printf("LBM_showGridStatistics:\n"
+         "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
+         "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
+         "\tminU: %e maxU: %e\n\n",
+         nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass,
+         sqrt(minU2), sqrt(maxU2));
 }
 
 /*############################################################################*/
 
-static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		const char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1];
-
-		fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
-	else {                                                     /* little endian */
-		fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void storeValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1; /* probe for the host byte order */
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    const char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    size_t i; /* unsigned, to match the sizeof() loop bound */
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1]; /* reverse bytes */
+
+    fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+  } else { /* little endian */
+    fwrite(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1];
-	}
-	else {                                                     /* little endian */
-		fread( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void loadValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1; /* probe for the host byte order */
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    size_t i; /* unsigned, to match the sizeof() loop bound */
+
+    fread(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1]; /* reverse bytes */
+  } else { /* little endian */
+    fread(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-		const int binary ) {
-	OUTPUT_PRECISION rho, ux, uy, uz;
-
-	FILE* file = fopen( filename, (binary ? "wb" : "w") );
-
-	SWEEP_VAR
-	SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z)
-				rho = + SRC_C( grid ) + SRC_N( grid )
-					+ SRC_S( grid ) + SRC_E( grid )
-					+ SRC_W( grid ) + SRC_T( grid )
-					+ SRC_B( grid ) + SRC_NE( grid )
-					+ SRC_NW( grid ) + SRC_SE( grid )
-					+ SRC_SW( grid ) + SRC_NT( grid )
-					+ SRC_NB( grid ) + SRC_ST( grid )
-					+ SRC_SB( grid ) + SRC_ET( grid )
-					+ SRC_EB( grid ) + SRC_WT( grid )
-					+ SRC_WB( grid );
-				ux = + SRC_E( grid ) - SRC_W( grid ) 
-					+ SRC_NE( grid ) - SRC_NW( grid ) 
-					+ SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_ET( grid ) + SRC_EB( grid ) 
-					- SRC_WT( grid ) - SRC_WB( grid );
-				uy = + SRC_N( grid ) - SRC_S( grid ) 
-					+ SRC_NE( grid ) + SRC_NW( grid ) 
-					- SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_NT( grid ) + SRC_NB( grid ) 
-					- SRC_ST( grid ) - SRC_SB( grid );
-				uz = + SRC_T( grid ) - SRC_B( grid ) 
-					+ SRC_NT( grid ) - SRC_NB( grid ) 
-					+ SRC_ST( grid ) - SRC_SB( grid ) 
-					+ SRC_ET( grid ) - SRC_EB( grid ) 
-					+ SRC_WT( grid ) - SRC_WB( grid );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					/*
-					   fwrite( &ux, sizeof( ux ), 1, file );
-					   fwrite( &uy, sizeof( uy ), 1, file );
-					   fwrite( &uz, sizeof( uz ), 1, file );
-					   */
-					storeValue( file, &ux );
-					storeValue( file, &uy );
-					storeValue( file, &uz );
-				} else
-					fprintf( file, "%e %e %e\n", ux, uy, uz );
-
-	SWEEP_END;
-
-	fclose( file );
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const int binary) {
+  OUTPUT_PRECISION rho, ux, uy, uz;
+
+  FILE *file = fopen(filename, (binary ? "wb" : "w"));
+
+  SWEEP_VAR
+  SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z)
+  rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) +
+        SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) +
+        SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) +
+        SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) +
+        SRC_WB(grid);
+  ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) -
+       SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid);
+  uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) -
+       SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid);
+  uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) -
+       SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid);
+  ux /= rho;
+  uy /= rho;
+  uz /= rho;
+
+  if (binary) {
+    /*
+       fwrite( &ux, sizeof( ux ), 1, file );
+       fwrite( &uy, sizeof( uy ), 1, file );
+       fwrite( &uz, sizeof( uz ), 1, file );
+       */
+    storeValue(file, &ux);
+    storeValue(file, &uy);
+    storeValue(file, &uz);
+  } else
+    fprintf(file, "%e %e %e\n", ux, uy, uz);
+
+  SWEEP_END;
+
+  fclose(file);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.h
index 8070cf3030305619453064ca9fbf2a4c4a23c24b..b687e8ebad95099908d0d214243b6e290e871cf5 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm.h
@@ -13,23 +13,26 @@
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2);
+void LBM_showGridStatistics(LBM_Grid Grid);
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary);
 
 /* OpenCL *********************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr );
-void OpenCL_LBM_freeGrid( cl_mem ptr );
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid );
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr);
+void OpenCL_LBM_freeGrid(cl_mem ptr);
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid);
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid);
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm_macros.h
index 2f8ba8a09c93f68815ec5ce41d18821fa7396e40..d789964063797f77346bfb53eaad3f7ff8695ced 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm_macros.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/lbm_macros.h
@@ -17,160 +17,181 @@
 #define TRUE (-1)
 #define FALSE (0)
 
-#define DFL1 (1.0f/ 3.0f)
-#define DFL2 (1.0f/18.0f)
-#define DFL3 (1.0f/36.0f)
+#define DFL1 (1.0f / 3.0f)
+#define DFL2 (1.0f / 18.0f)
+#define DFL3 (1.0f / 36.0f)
 
 /*############################################################################*/
 
-typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+// float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
+typedef float *LBM_Grid;
+typedef LBM_Grid *LBM_GridPtr;
 
 /*############################################################################*/
 
-
-#define SWEEP_X  __temp_x__
-#define SWEEP_Y  __temp_y__
-#define SWEEP_Z  __temp_z__
+#define SWEEP_X __temp_x__
+#define SWEEP_Y __temp_y__
+#define SWEEP_Z __temp_z__
 #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-	for( __temp_z__ = z1; \
-	     __temp_z__ < z2; \
-		__temp_z__++) { \
-            for( __temp_y__ = 0; \
-                 __temp_y__ < SIZE_Y; \
-                 __temp_y__++) { \
-		for(__temp_x__ = 0; \
-	            __temp_x__ < SIZE_X; \
-                    __temp_x__++) { \
-
-#define SWEEP_END }}}
-
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) {                       \
+    for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) {                  \
+      for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) {
+
+#define SWEEP_END                                                              \
+  }                                                                            \
+  }                                                                            \
+  }
+
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)])
+
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #ifdef SCATTER
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C))
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+
+#define DST_C(g) (NEIGHBOR_C(g, C))
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* GATHER */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C))
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* GATHER */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
+#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v)
+
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c
index 168cc12fd63245b6a04d3f8468d7b3fe463187db..d93a919df300c520c7105612cc54f9684f052678 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.c
@@ -15,10 +15,10 @@
 #include <sys/stat.h>
 
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
-#include "ocl.h"
 #include "main.h"
-#include "lbm.h"
+#include "ocl.h"
 
 /*############################################################################*/
 
@@ -27,286 +27,294 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid;
 /*############################################################################*/
 
 struct pb_TimerSet timers;
-int main( int nArgs, char* arg[] ) {
-    MAIN_Param param;
-    int t;
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
+  int t;
 
-    OpenCL_Param prm;
+  OpenCL_Param prm;
 
-    struct pb_Parameters* params;
-    params = pb_ReadParameters(&nArgs, arg);
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
 
+  // Setup TEMP datastructures
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
 
-    //Setup TEMP datastructures
-    MAIN_parseCommandLine( nArgs, arg, &param, params );
-    MAIN_printInfo( &param );
+  /*MAIN_initialize( &param, &prm ); */ // This has been inlined
 
-    /*MAIN_initialize( &param, &prm ); */ // This has been inlined
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
 
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
-
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    if( param.obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename );
-    }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  if (param.obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename);
+  }
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
- 
-    OpenCL_initialize(&prm);
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
 
-    //Setup DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid );
+  OpenCL_initialize(&prm);
 
-    //Initialize DEVICE datastructures
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid );
+  // Setup DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid);
 
+  // Initialize DEVICE datastructures
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
 
-    for( t = 1; t <= param.nTimeSteps; t++ ) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-        OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid );
-        /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-        LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid );
+  for (t = 1; t <= param.nTimeSteps; t++) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+    OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
+    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+    LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid);
 
-        /*if( (t & 63) == 0 ) {*/
-            /*printf( "timestep: %i\n", t );*/
+    /*if( (t & 63) == 0 ) {*/
+    /*printf( "timestep: %i\n", t );*/
 #if 0
             CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
             LBM_showGridStatistics( *TEMP_srcGrid );
 #endif
-        /*}*/
-    }
+    /*}*/
+  }
 
-    /*MAIN_finalize( &param, &prm );*/ // inlined
+  /*MAIN_finalize( &param, &prm );*/ // inlined
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm.clProgram);
-    clReleaseKernel(prm.clKernel);
-    clReleaseCommandQueue(prm.clCommandQueue);
-    clReleaseContext(prm.clContext);
+  clReleaseProgram(prm.clProgram);
+  clReleaseKernel(prm.clKernel);
+  clReleaseCommandQueue(prm.clCommandQueue);
+  clReleaseContext(prm.clContext);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
-    LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
+  LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 
-    pb_FreeParameters(params);
-    return 0;
+  pb_FreeParameters(params);
+  return 0;
 }
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) {
-    struct stat fileStat;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
 
-    if( nArgs < 2 ) {
-        printf( "syntax: lbm <time steps>\n" );
-        exit( 1 );
-    }
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
+
+  param->nTimeSteps = atoi(arg[1]);
 
-    param->nTimeSteps     = atoi( arg[1] );
-
-    if( params->inpFiles[0] != NULL ) {
-        param->obstacleFilename = params->inpFiles[0];
-
-        if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-            printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-                    param->obstacleFilename );
-            exit( 1 );
-        }
-        if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-            printf( "MAIN_parseCommandLine:\n"
-                    "\tsize of file '%s' is %i bytes\n"
-                    "\texpected size is %i bytes\n",
-                    param->obstacleFilename, (int) fileStat.st_size,
-                    SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-            exit( 1 );
-        }
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
+    }
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
     }
-    else param->obstacleFilename = NULL;
+  } else
+    param->obstacleFilename = NULL;
 
-    param->resultFilename = params->outFile;
+  param->resultFilename = params->outFile;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-    printf( "MAIN_printInfo:\n"
-            "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-            "\tnTimeSteps     : %i\n"
-            "\tresult file    : %s\n"
-            "\taction         : %s\n"
-            "\tsimulation type: %s\n"
-            "\tobstacle file  : %s\n\n",
-            SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-            param->nTimeSteps, param->resultFilename,
-            "store", "lid-driven cavity",
-            (param->obstacleFilename == NULL) ? "<none>" :
-            param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    if( param->obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename );
-    }
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    //Setup DEVICE datastructures
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid );
-
-    //Initialize DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
-
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  // Setup DEVICE datastructures
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
+
+  // Initialize DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
+
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_Grid TEMP_srcGrid;
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_Grid TEMP_srcGrid;
 
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE );
+  LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
 
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm->clProgram);
-    clReleaseKernel(prm->clKernel);
-    clReleaseCommandQueue(prm->clCommandQueue);
-    clReleaseContext(prm->clContext);
+  clReleaseProgram(prm->clProgram);
+  clReleaseKernel(prm->clKernel);
+  clReleaseCommandQueue(prm->clCommandQueue);
+  clReleaseContext(prm->clContext);
 }
 
-void OpenCL_initialize(OpenCL_Param* prm)
-{
-    cl_int clStatus;
-
-    cl_uint numPlatforms;
-    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_platform_id clPlatform[numPlatforms];
-    clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
-    CHECK_ERROR("clGetPlatformIDs")
-    prm->clPlatform = clPlatform[1];
-
-    prm->clCps[0] = CL_CONTEXT_PLATFORM;
-    prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
-    prm->clCps[2] = 0;
-
-    clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_CPU,1,&(prm->clDevice),NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    /*printf("Device id = %p\n", prm->clDevice);*/
-    /*cl_device_partition_property props[3];*/
-    /*props[0] = CL_DEVICE_PARTITION_EQUALLY;*/
-    /*props[1] = 1;*/
-    /*props[2] = 0;*/
-    /*cl_device_id subdevice_id[8];*/
-    /*cl_uint num_entries = 8;*/
-
-    /*cl_uint numDevices;*/
-    /*clCreateSubDevices(prm->clDevice, props, num_entries, subdevice_id, &numDevices);*/
-    /*printf("Num of devices = %d\n", numDevices);*/
-    /*for(unsigned i =0 ; i< numDevices; i++)*/
-      /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/
-    /*prm->clDevice = subdevice_id[0];*/
-
-    /*printf("Device id = %p\n", prm->clDevice);*/
-    /*prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);*/
-    prm->clContext = clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
-
-    /*const unsigned char* clSource[] = {readFile("kernel.ir")};*/
-
-    /*prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);*/
-    /*size_t binarySize = 39303;*/
-    /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice, &binarySize, &clSource[0], NULL,&clStatus);*/
-    /*CHECK_ERROR("clCreateProgramWithSource")*/
-
-    /*char clOptions[100];*/
-    /*sprintf(clOptions,"-I src/opencl_nvidia");*/
-
-    /*clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);*/
-    /*CHECK_ERROR("clBuildProgram")*/
-
-    /*prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);*/
-    /*CHECK_ERROR("clCreateKernel")*/
-
-    /*free((void*)clSource[0]);*/
-
-    pb_CreateAndBuildKernelFromBinary("kernel.ir", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel);
+void OpenCL_initialize(OpenCL_Param *prm) {
+  cl_int clStatus;
+
+  cl_uint numPlatforms;
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+  prm->clPlatform = clPlatform[numPlatforms > 1 ? 1 : 0]; // avoid OOB read
+
+  prm->clCps[0] = CL_CONTEXT_PLATFORM;
+  prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
+  prm->clCps[2] = 0;
+
+  clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_CPU, 1,
+                            &(prm->clDevice), NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  /*printf("Device id = %p\n", prm->clDevice);*/
+  /*cl_device_partition_property props[3];*/
+  /*props[0] = CL_DEVICE_PARTITION_EQUALLY;*/
+  /*props[1] = 1;*/
+  /*props[2] = 0;*/
+  /*cl_device_id subdevice_id[8];*/
+  /*cl_uint num_entries = 8;*/
+
+  /*cl_uint numDevices;*/
+  /*clCreateSubDevices(prm->clDevice, props, num_entries, subdevice_id,
+   * &numDevices);*/
+  /*printf("Num of devices = %d\n", numDevices);*/
+  /*for(unsigned i =0 ; i< numDevices; i++)*/
+  /*printf("Subdevice id %d = %p\n", i, subdevice_id[i]);*/
+  /*prm->clDevice = subdevice_id[0];*/
+
+  /*printf("Device id = %p\n", prm->clDevice);*/
+  /*prm->clContext =
+   * clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);*/
+  prm->clContext =
+      clCreateContext(prm->clCps, 1, &prm->clDevice, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContext")
+
+  prm->clCommandQueue = clCreateCommandQueue(
+      prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
+
+  /*const unsigned char* clSource[] = {readFile("kernel.ir")};*/
+
+  /*prm->clProgram =
+   * clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);*/
+  /*size_t binarySize = 39303;*/
+  /*prm->clProgram = clCreateProgramWithBinary(prm->clContext,1, &prm->clDevice,
+   * &binarySize, &clSource[0], NULL,&clStatus);*/
+  /*CHECK_ERROR("clCreateProgramWithSource")*/
+
+  /*char clOptions[100];*/
+  /*sprintf(clOptions,"-I src/opencl_nvidia");*/
+
+  /*clStatus =
+   * clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);*/
+  /*CHECK_ERROR("clBuildProgram")*/
+
+  /*prm->clKernel =
+   * clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);*/
+  /*CHECK_ERROR("clCreateKernel")*/
+
+  /*free((void*)clSource[0]);*/
+
+  pb_CreateAndBuildKernelFromBinary("kernel.ir", "performStreamCollide_kernel",
+                                    &prm->clContext, &prm->clDevice,
+                                    &prm->clProgram, &prm->clKernel);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.h
index 2ca41792bbd8ed8d7596d52e1ef79038935617ca..5f58edc2616cece34c4b3d0467f991d9c4bd93c9 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/main.h
@@ -12,19 +12,20 @@
 /*############################################################################*/
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm );
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm);
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm);
 
-void OpenCL_initialize(OpenCL_Param* prm);
+void OpenCL_initialize(OpenCL_Param *prm);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.c
index 78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.c
@@ -1,40 +1,36 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include "ocl.h"
 
-char* readFile(char* fileName)
-{
-	FILE* fp;
-	fp = fopen(fileName,"r");
+char *readFile(char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
 
-	if(fp == NULL)
-	{
-		printf("Error 1!\n");
-		return NULL;
-	}
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    return NULL;
+  }
 
-	fseek(fp,0,SEEK_END);
-	long size = ftell(fp);
-	rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-	char* buffer = malloc(sizeof(char)*(size+1));
-	if(buffer == NULL)
-	{
-		printf("Error 2!\n");
-		fclose(fp);
-		return NULL;
-	}
+  char *buffer = malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	size_t res = fread(buffer,1,size,fp);
-	if(res != size)
-	{
-		printf("Error 3!\n");
-		fclose(fp);
-		return NULL;
-	}
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	buffer[size] = 0;
-	fclose(fp);
-	return buffer;
-}	
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
+}
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.h
index c7a93a636ea59f77e59a61032b68ad8c15477511..d5011fdcf889fb729689b2a9bf08d76e6c828f10 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_cpu_short/ocl.h
@@ -2,24 +2,22 @@
 #define __OCLH__
 
 typedef struct {
-	cl_platform_id clPlatform;
-	cl_context_properties clCps[3];
-	cl_device_id clDevice;
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-	cl_program clProgram;
-	cl_kernel clKernel;
+  cl_platform_id clPlatform;
+  cl_context_properties clCps[3];
+  cl_device_id clDevice;
+  cl_context clContext;
+  cl_command_queue clCommandQueue;
+  cl_program clProgram;
+  cl_kernel clKernel;
 } OpenCL_Param;
 
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s! Errcode = %d\n",errorMessage, clStatus);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s! Errcode = %d\n", errorMessage, clStatus);               \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-char* readFile(char*);
+char *readFile(char *);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h
index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/layout_config.h
@@ -13,31 +13,31 @@
 
 /*############################################################################*/
 
-//Unchangeable settings: volume simulation size for the given example
+// Unchangeable settings: volume simulation size for the given example
 #define SIZE_X (120)
 #define SIZE_Y (120)
 #define SIZE_Z (150)
 
-//Changeable settings
-//Padding in each dimension
+// Changeable settings
+// Padding in each dimension
 #define PADDING_X (8)
 #define PADDING_Y (0)
 #define PADDING_Z (4)
 
-//Pitch in each dimension
-#define PADDED_X (SIZE_X+PADDING_X)
-#define PADDED_Y (SIZE_Y+PADDING_Y)
-#define PADDED_Z (SIZE_Z+PADDING_Z)
+// Pitch in each dimension
+#define PADDED_X (SIZE_X + PADDING_X)
+#define PADDED_Y (SIZE_Y + PADDING_Y)
+#define PADDED_Z (SIZE_Z + PADDING_Z)
 
-#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
-#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)
+#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z)
+#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z)
 
 //  Flattening function
 //  This macro will be used to map a 3-D index and element to a value
-#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \
-                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
+#define CALC_INDEX(x, y, z, e)                                                 \
+  (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y))
 
-#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
+#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0))
 
 // Set this value to 1 for GATHER, or 0 for SCATTER
 #if 1
@@ -46,22 +46,41 @@
 #define SCATTER
 #endif
 
-//OpenCL block size (not trivially changeable here)
+// OpenCL block size (not trivially changeable here)
 #define BLOCK_SIZE SIZE_X
 
 /*############################################################################*/
 
-typedef enum {C = 0,
-              N, S, E, W, T, B,
-              NE, NW, SE, SW,
-              NT, NB, ST, SB,
-              ET, EB, WT, WB,
-              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+typedef enum {
+  C = 0,
+  N,
+  S,
+  E,
+  W,
+  T,
+  B,
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS,
+  N_CELL_ENTRIES
+} CELL_ENTRIES;
 
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
+typedef enum {
+  OBSTACLE = 1 << 0,
+  ACCEL = 1 << 1,
+  IN_OUT_FLOW = 1 << 2
+} CELL_FLAGS;
 
 #endif /* _CONFIG_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c
index aab11ee0cb215bc918cffecf23e97c9eb528b71c..14ffa4211b3763d7c1c6538e693a76be61a0b158 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.c
@@ -10,338 +10,312 @@
 
 // includes, system
 #include <CL/cl.h>
+#include <float.h>
 #include <math.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <float.h>
 
 // includes, project
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
 #include "ocl.h"
-#include "lbm.h"
 
 /******************************************************************************/
 
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) {
-	 
-	cl_int clStatus;
-
-	clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid);
-	CHECK_ERROR("clSetKernelArg")
-
-	clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid);
-	CHECK_ERROR("clSetKernelArg")
-	
-	size_t dimBlock[3] = {SIZE_X,1,1};
-	size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1};
-	clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); 
-	CHECK_ERROR("clEnqueueNDRangeKernel") 	
-	
-	clStatus = clFinish(prm->clCommandQueue);
-	CHECK_ERROR("clFinish")
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid) {
+
+  cl_int clStatus;
+
+  clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  size_t dimBlock[3] = {SIZE_X, 1, 1};
+  size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1};
+  clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL,
+                                    dimGrid, dimBlock, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueNDRangeKernel")
+
+  clStatus = clFinish(prm->clCommandQueue);
+  CHECK_ERROR("clFinish")
 }
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr ) {
-	const size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
+void LBM_allocateGrid(float **ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+
+  *ptr = (float *)malloc(size);
+  if (!*ptr) {
+    printf("LBM_allocateGrid: could not allocate %.1f MByte\n",
+           size / (1024.0 * 1024.0));
+    exit(1);
+  }
 
-	*ptr = (float*)malloc( size );
-	if( ! *ptr ) {
-		printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
-				size / (1024.0*1024.0) );
-		exit( 1 );
-	}
+  memset(*ptr, 0, size);
 
-	memset( *ptr, 0, size );
+  printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0));
 
-	printf( "LBM_allocateGrid: allocated %.1f MByte\n",
-			size / (1024.0*1024.0) );
-	
-	*ptr += MARGIN;
+  *ptr += MARGIN;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-	cl_int clStatus;
-	*ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  *ptr =
+      clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_freeGrid( float** ptr ) {
-	free( *ptr-MARGIN );
-	*ptr = NULL;
+void LBM_freeGrid(float **ptr) {
+  free(*ptr - MARGIN);
+  *ptr = NULL;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_freeGrid(cl_mem ptr) {
-	clReleaseMemObject(ptr);
-}
+void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); }
 
 /*############################################################################*/
 
-void LBM_initializeGrid( LBM_Grid grid ) {
-	SWEEP_VAR
-
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-	SRC_C( grid  ) = DFL1;
-	SRC_N( grid  ) = DFL2;
-	SRC_S( grid  ) = DFL2;
-	SRC_E( grid  ) = DFL2;
-	SRC_W( grid  ) = DFL2;
-	SRC_T( grid  ) = DFL2;
-	SRC_B( grid  ) = DFL2;
-	SRC_NE( grid ) = DFL3;
-	SRC_NW( grid ) = DFL3;
-	SRC_SE( grid ) = DFL3;
-	SRC_SW( grid ) = DFL3;
-	SRC_NT( grid ) = DFL3;
-	SRC_NB( grid ) = DFL3;
-	SRC_ST( grid ) = DFL3;
-	SRC_SB( grid ) = DFL3;
-	SRC_ET( grid ) = DFL3;
-	SRC_EB( grid ) = DFL3;
-	SRC_WT( grid ) = DFL3;
-	SRC_WB( grid ) = DFL3;
-	
-	CLEAR_ALL_FLAGS_SWEEP( grid );
-	SWEEP_END
+void LBM_initializeGrid(LBM_Grid grid) {
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  SRC_C(grid) = DFL1;
+  SRC_N(grid) = DFL2;
+  SRC_S(grid) = DFL2;
+  SRC_E(grid) = DFL2;
+  SRC_W(grid) = DFL2;
+  SRC_T(grid) = DFL2;
+  SRC_B(grid) = DFL2;
+  SRC_NE(grid) = DFL3;
+  SRC_NW(grid) = DFL3;
+  SRC_SE(grid) = DFL3;
+  SRC_SW(grid) = DFL3;
+  SRC_NT(grid) = DFL3;
+  SRC_NB(grid) = DFL3;
+  SRC_ST(grid) = DFL3;
+  SRC_SB(grid) = DFL3;
+  SRC_ET(grid) = DFL3;
+  SRC_EB(grid) = DFL3;
+  SRC_WT(grid) = DFL3;
+  SRC_WB(grid) = DFL3;
+
+  CLEAR_ALL_FLAGS_SWEEP(grid);
+  SWEEP_END
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); 
-	cl_int clStatus;
-	clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                  h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
 }
 
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-        cl_int clStatus;
-        clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                 h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) {
-	cl_mem aux = *grid1;
-	*grid1 = *grid2;
-	*grid2 = aux;
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) {
+  cl_mem aux = *grid1;
+  *grid1 = *grid2;
+  *grid2 = aux;
 }
 
 /*############################################################################*/
 
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
-	int x,  y,  z;
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) {
+  int x, y, z;
 
-	FILE* file = fopen( filename, "rb" );
+  FILE *file = fopen(filename, "rb");
 
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE );
-			}
-			fgetc( file );
-		}
-		fgetc( file );
-	}
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (fgetc(file) != '.')
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+      }
+      fgetc(file);
+    }
+    fgetc(file);
+  }
 
-	fclose( file );
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
-	int x,  y,  z;
-
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-						y == 0 || y == SIZE_Y-1 ||
-						z == 0 || z == SIZE_Z-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-				}
-				else {
-					if( (z == 1 || z == SIZE_Z-2) &&
-							x > 1 && x < SIZE_X-2 &&
-							y > 1 && y < SIZE_Y-2 ) {
-						SET_FLAG( grid, x, y, z, ACCEL );
-					}
-				}
-			}
-		}
-	}
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) {
+  int x, y, z;
+
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 ||
+            z == SIZE_Z - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+        } else {
+          if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 &&
+              y < SIZE_Y - 2) {
+            SET_FLAG(grid, x, y, z, ACCEL);
+          }
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_showGridStatistics( LBM_Grid grid ) {
-	int nObstacleCells = 0,
-	    nAccelCells    = 0,
-	    nFluidCells    = 0;
-	float ux, uy, uz;
-	float minU2  = 1e+30, maxU2  = -1e+30, u2;
-	float minRho = 1e+30, maxRho = -1e+30, rho;
-	float mass = 0;
-
-	SWEEP_VAR
-
-		SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		rho = LOCAL( grid, C  ) + LOCAL( grid, N  )
-		+ LOCAL( grid, S  ) + LOCAL( grid, E  )
-		+ LOCAL( grid, W  ) + LOCAL( grid, T  )
-		+ LOCAL( grid, B  ) + LOCAL( grid, NE )
-		+ LOCAL( grid, NW ) + LOCAL( grid, SE )
-		+ LOCAL( grid, SW ) + LOCAL( grid, NT )
-		+ LOCAL( grid, NB ) + LOCAL( grid, ST )
-		+ LOCAL( grid, SB ) + LOCAL( grid, ET )
-		+ LOCAL( grid, EB ) + LOCAL( grid, WT )
-		+ LOCAL( grid, WB );
-
-	if( rho < minRho ) minRho = rho;
-	if( rho > maxRho ) maxRho = rho;
-	mass += rho;
-
-	if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
-		nObstacleCells++;
-	}
-	else {
-		if( TEST_FLAG_SWEEP( grid, ACCEL ))
-			nAccelCells++;
-		else
-			nFluidCells++;
-
-		ux = + LOCAL( grid, E  ) - LOCAL( grid, W  )
-			+ LOCAL( grid, NE ) - LOCAL( grid, NW )
-			+ LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, ET ) + LOCAL( grid, EB )
-			- LOCAL( grid, WT ) - LOCAL( grid, WB );
-		uy = + LOCAL( grid, N  ) - LOCAL( grid, S  )
-			+ LOCAL( grid, NE ) + LOCAL( grid, NW )
-			- LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, NT ) + LOCAL( grid, NB )
-			- LOCAL( grid, ST ) - LOCAL( grid, SB );
-		uz = + LOCAL( grid, T  ) - LOCAL( grid, B  )
-			+ LOCAL( grid, NT ) - LOCAL( grid, NB )
-			+ LOCAL( grid, ST ) - LOCAL( grid, SB )
-			+ LOCAL( grid, ET ) - LOCAL( grid, EB )
-			+ LOCAL( grid, WT ) - LOCAL( grid, WB );
-		u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);
-		if( u2 < minU2 ) minU2 = u2;
-		if( u2 > maxU2 ) maxU2 = u2;
-	}
-	SWEEP_END
-
-		printf( "LBM_showGridStatistics:\n"
-				"\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
-				"\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
-				"\tminU: %e maxU: %e\n\n",
-				nObstacleCells, nAccelCells, nFluidCells,
-				minRho, maxRho, mass,
-				sqrt( minU2 ), sqrt( maxU2 ) );
-
+void LBM_showGridStatistics(LBM_Grid grid) {
+  int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0;
+  float ux, uy, uz;
+  float minU2 = 1e+30, maxU2 = -1e+30, u2;
+  float minRho = 1e+30, maxRho = -1e+30, rho;
+  float mass = 0;
+
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) +
+        LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) +
+        LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) +
+        LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) +
+        LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB);
+
+  if (rho < minRho)
+    minRho = rho;
+  if (rho > maxRho)
+    maxRho = rho;
+  mass += rho;
+
+  if (TEST_FLAG_SWEEP(grid, OBSTACLE)) {
+    nObstacleCells++;
+  } else {
+    if (TEST_FLAG_SWEEP(grid, ACCEL))
+      nAccelCells++;
+    else
+      nFluidCells++;
+
+    ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) +
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) -
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) -
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) -
+         LOCAL(grid, ST) - LOCAL(grid, SB);
+    uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) +
+         LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) +
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho);
+    if (u2 < minU2)
+      minU2 = u2;
+    if (u2 > maxU2)
+      maxU2 = u2;
+  }
+  SWEEP_END
+
+  printf("LBM_showGridStatistics:\n"
+         "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
+         "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
+         "\tminU: %e maxU: %e\n\n",
+         nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass,
+         sqrt(minU2), sqrt(maxU2));
 }
 
 /*############################################################################*/
 
-static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		const char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1];
-
-		fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
-	else {                                                     /* little endian */
-		fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void storeValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    const char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    int i;
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1];
+
+    fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+  } else { /* little endian */
+    fwrite(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1];
-	}
-	else {                                                     /* little endian */
-		fread( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void loadValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    int i;
+
+    fread(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1];
+  } else { /* little endian */
+    fread(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-		const int binary ) {
-	OUTPUT_PRECISION rho, ux, uy, uz;
-
-	FILE* file = fopen( filename, (binary ? "wb" : "w") );
-
-	SWEEP_VAR
-	SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z)
-				rho = + SRC_C( grid ) + SRC_N( grid )
-					+ SRC_S( grid ) + SRC_E( grid )
-					+ SRC_W( grid ) + SRC_T( grid )
-					+ SRC_B( grid ) + SRC_NE( grid )
-					+ SRC_NW( grid ) + SRC_SE( grid )
-					+ SRC_SW( grid ) + SRC_NT( grid )
-					+ SRC_NB( grid ) + SRC_ST( grid )
-					+ SRC_SB( grid ) + SRC_ET( grid )
-					+ SRC_EB( grid ) + SRC_WT( grid )
-					+ SRC_WB( grid );
-				ux = + SRC_E( grid ) - SRC_W( grid ) 
-					+ SRC_NE( grid ) - SRC_NW( grid ) 
-					+ SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_ET( grid ) + SRC_EB( grid ) 
-					- SRC_WT( grid ) - SRC_WB( grid );
-				uy = + SRC_N( grid ) - SRC_S( grid ) 
-					+ SRC_NE( grid ) + SRC_NW( grid ) 
-					- SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_NT( grid ) + SRC_NB( grid ) 
-					- SRC_ST( grid ) - SRC_SB( grid );
-				uz = + SRC_T( grid ) - SRC_B( grid ) 
-					+ SRC_NT( grid ) - SRC_NB( grid ) 
-					+ SRC_ST( grid ) - SRC_SB( grid ) 
-					+ SRC_ET( grid ) - SRC_EB( grid ) 
-					+ SRC_WT( grid ) - SRC_WB( grid );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					/*
-					   fwrite( &ux, sizeof( ux ), 1, file );
-					   fwrite( &uy, sizeof( uy ), 1, file );
-					   fwrite( &uz, sizeof( uz ), 1, file );
-					   */
-					storeValue( file, &ux );
-					storeValue( file, &uy );
-					storeValue( file, &uz );
-				} else
-					fprintf( file, "%e %e %e\n", ux, uy, uz );
-
-	SWEEP_END;
-
-	fclose( file );
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const int binary) {
+  OUTPUT_PRECISION rho, ux, uy, uz;
+
+  FILE *file = fopen(filename, (binary ? "wb" : "w"));
+
+  SWEEP_VAR
+  SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z)
+  rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) +
+        SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) +
+        SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) +
+        SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) +
+        SRC_WB(grid);
+  ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) -
+       SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid);
+  uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) -
+       SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid);
+  uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) -
+       SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid);
+  ux /= rho;
+  uy /= rho;
+  uz /= rho;
+
+  if (binary) {
+    /*
+       fwrite( &ux, sizeof( ux ), 1, file );
+       fwrite( &uy, sizeof( uy ), 1, file );
+       fwrite( &uz, sizeof( uz ), 1, file );
+       */
+    storeValue(file, &ux);
+    storeValue(file, &uy);
+    storeValue(file, &uz);
+  } else
+    fprintf(file, "%e %e %e\n", ux, uy, uz);
+
+  SWEEP_END;
+
+  fclose(file);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h
index 9dcf4639faf25701b015e0d3e6dcf0f9400b1745..64a617feb862bdffdcb0c6aa57b0f1b09c26debb 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm.h
@@ -13,23 +13,26 @@
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2);
+void LBM_showGridStatistics(LBM_Grid Grid);
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary);
 
 /* OpenCL *********************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr );
-void OpenCL_LBM_freeGrid( cl_mem ptr );
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid );
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr);
+void OpenCL_LBM_freeGrid(cl_mem ptr);
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid);
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid);
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h
index 24fad43205f11da1c05cc8aa5895e7aa2688d3f4..99c50c048a14bb47bb3659b61f088db95706bb0c 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/lbm_macros.h
@@ -17,160 +17,181 @@
 #define TRUE (-1)
 #define FALSE (0)
 
-#define DFL1 (1.0f/ 3.0f)
-#define DFL2 (1.0f/18.0f)
-#define DFL3 (1.0f/36.0f)
+#define DFL1 (1.0f / 3.0f)
+#define DFL2 (1.0f / 18.0f)
+#define DFL3 (1.0f / 36.0f)
 
 /*############################################################################*/
 
-typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+typedef float
+    *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
+typedef LBM_Grid *LBM_GridPtr;
 
 /*############################################################################*/
 
-
-#define SWEEP_X  __temp_x__
-#define SWEEP_Y  __temp_y__
-#define SWEEP_Z  __temp_z__
+#define SWEEP_X __temp_x__
+#define SWEEP_Y __temp_y__
+#define SWEEP_Z __temp_z__
 #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-	for( __temp_z__ = z1; \
-	     __temp_z__ < z2; \
-		__temp_z__++) { \
-            for( __temp_y__ = 0; \
-                 __temp_y__ < SIZE_Y; \
-                 __temp_y__++) { \
-		for(__temp_x__ = 0; \
-	            __temp_x__ < SIZE_X; \
-                    __temp_x__++) { \
-
-#define SWEEP_END }}}
-
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) {                       \
+    for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) {                  \
+      for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) {
+
+#define SWEEP_END                                                              \
+  }                                                                            \
+  }                                                                            \
+  }
+
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)])
+
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #ifdef SCATTER
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C))
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+
+#define DST_C(g) (NEIGHBOR_C(g, C))
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* GATHER */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C))
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* GATHER */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
+#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v)
+
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
index 41b22f662a69e29a1b67eaf54bfb2de439becd78..18320b7394e5d499339ee820a992b00acd9b368e 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
@@ -15,10 +15,10 @@
 #include <sys/stat.h>
 
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
-#include "ocl.h"
 #include "main.h"
-#include "lbm.h"
+#include "ocl.h"
 
 /*############################################################################*/
 
@@ -27,258 +27,262 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid;
 /*############################################################################*/
 
 struct pb_TimerSet timers;
-int main( int nArgs, char* arg[] ) {
-    MAIN_Param param;
-    int t;
-
-    OpenCL_Param prm;
-
-    struct pb_Parameters* params;
-    params = pb_ReadParameters(&nArgs, arg);
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
+  int t;
 
+  OpenCL_Param prm;
 
-    //Setup TEMP datastructures
-    MAIN_parseCommandLine( nArgs, arg, &param, params );
-    MAIN_printInfo( &param );
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
 
-    /*MAIN_initialize( &param, &prm ); */ // This has been inlined
+  // Setup TEMP datastructures
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
 
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+  /*MAIN_initialize( &param, &prm ); */ // This has been inlined
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    if( param.obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename );
-    }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  if (param.obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename);
+  }
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
- 
-    OpenCL_initialize(&prm);
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    //Setup DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid );
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    //Initialize DEVICE datastructures
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid );
+  OpenCL_initialize(&prm);
 
+  // Setup DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid);
 
+  // Initialize DEVICE datastructures
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
 
-    for( t = 1; t <= param.nTimeSteps; t++ ) {
-        pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-        OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid );
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-        LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid );
+  for (t = 1; t <= param.nTimeSteps; t++) {
+    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+    OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid);
 
-        /*if( (t & 63) == 0 ) {*/
-            /*printf( "timestep: %i\n", t );*/
+    /*if( (t & 63) == 0 ) {*/
+    /*printf( "timestep: %i\n", t );*/
 #if 0
             CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
             LBM_showGridStatistics( *TEMP_srcGrid );
 #endif
-        /*}*/
-    }
+    /*}*/
+  }
 
-    /*MAIN_finalize( &param, &prm );*/ // inlined
+  /*MAIN_finalize( &param, &prm );*/ // inlined
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm.clProgram);
-    clReleaseKernel(prm.clKernel);
-    clReleaseCommandQueue(prm.clCommandQueue);
-    clReleaseContext(prm.clContext);
+  clReleaseProgram(prm.clProgram);
+  clReleaseKernel(prm.clKernel);
+  clReleaseCommandQueue(prm.clCommandQueue);
+  clReleaseContext(prm.clContext);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
-    LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
+  LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 
-    pb_FreeParameters(params);
-    return 0;
+  pb_FreeParameters(params);
+  return 0;
 }
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) {
-    struct stat fileStat;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
 
-    if( nArgs < 2 ) {
-        printf( "syntax: lbm <time steps>\n" );
-        exit( 1 );
-    }
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
 
-    param->nTimeSteps     = atoi( arg[1] );
-
-    if( params->inpFiles[0] != NULL ) {
-        param->obstacleFilename = params->inpFiles[0];
-
-        if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-            printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-                    param->obstacleFilename );
-            exit( 1 );
-        }
-        if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-            printf( "MAIN_parseCommandLine:\n"
-                    "\tsize of file '%s' is %i bytes\n"
-                    "\texpected size is %i bytes\n",
-                    param->obstacleFilename, (int) fileStat.st_size,
-                    SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-            exit( 1 );
-        }
+  param->nTimeSteps = atoi(arg[1]);
+
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
+    }
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
     }
-    else param->obstacleFilename = NULL;
+  } else
+    param->obstacleFilename = NULL;
 
-    param->resultFilename = params->outFile;
+  param->resultFilename = params->outFile;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-    printf( "MAIN_printInfo:\n"
-            "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-            "\tnTimeSteps     : %i\n"
-            "\tresult file    : %s\n"
-            "\taction         : %s\n"
-            "\tsimulation type: %s\n"
-            "\tobstacle file  : %s\n\n",
-            SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-            param->nTimeSteps, param->resultFilename,
-            "store", "lid-driven cavity",
-            (param->obstacleFilename == NULL) ? "<none>" :
-            param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    if( param->obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename );
-    }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
+  }
 
-    //Setup DEVICE datastructures
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
 
-    //Initialize DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid );
+  // Setup DEVICE datastructures
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
+  // Initialize DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_Grid TEMP_srcGrid;
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_Grid TEMP_srcGrid;
 
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE );
+  LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
 
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm->clProgram);
-    clReleaseKernel(prm->clKernel);
-    clReleaseCommandQueue(prm->clCommandQueue);
-    clReleaseContext(prm->clContext);
+  clReleaseProgram(prm->clProgram);
+  clReleaseKernel(prm->clKernel);
+  clReleaseCommandQueue(prm->clCommandQueue);
+  clReleaseContext(prm->clContext);
 }
 
-void OpenCL_initialize(OpenCL_Param* prm)
-{
-    cl_int clStatus;
+void OpenCL_initialize(OpenCL_Param *prm) {
+  cl_int clStatus;
 
-    clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL);
-    CHECK_ERROR("clGetPlatformIDs")
+  clStatus = clGetPlatformIDs(1, &(prm->clPlatform), NULL);
+  CHECK_ERROR("clGetPlatformIDs")
 
-    prm->clCps[0] = CL_CONTEXT_PLATFORM;
-    prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
-    prm->clCps[2] = 0;
+  prm->clCps[0] = CL_CONTEXT_PLATFORM;
+  prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
+  prm->clCps[2] = 0;
 
-    clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL);
-    CHECK_ERROR("clGetDeviceIDs")
+  clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_GPU, 1,
+                            &(prm->clDevice), NULL);
+  CHECK_ERROR("clGetDeviceIDs")
 
-    prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
+  prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_GPU, NULL,
+                                           NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
 
-    prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
+  prm->clCommandQueue = clCreateCommandQueue(
+      prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
 
-    pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
+  pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
 
-    const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-    prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);
-    CHECK_ERROR("clCreateProgramWithSource")
+  const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+  prm->clProgram =
+      clCreateProgramWithSource(prm->clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
 
-    char clOptions[100];
-    sprintf(clOptions,"-I src/opencl_nvidia");
+  char clOptions[100];
+  sprintf(clOptions, "-I src/opencl_nvidia");
 
-    clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);
-    CHECK_ERROR("clBuildProgram")
+  clStatus = clBuildProgram(prm->clProgram, 1, &(prm->clDevice), clOptions,
+                            NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
 
-    prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);
-    CHECK_ERROR("clCreateKernel")
+  prm->clKernel =
+      clCreateKernel(prm->clProgram, "performStreamCollide_kernel", &clStatus);
+  CHECK_ERROR("clCreateKernel")
 
-    free((void*)clSource[0]);
+  free((void *)clSource[0]);
 
-    /*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel);*/
+  /*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s",
+   * "performStreamCollide_kernel", &prm->clContext, &prm->clDevice,
+   * &prm->clProgram, &prm->clKernel);*/
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h
index feee4e8768b13f0975481b1e3a5505ad3cdd018f..9d8e145c93b37488a3826e77b964c56699377d2a 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/main.h
@@ -12,19 +12,20 @@
 /*############################################################################*/
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm );
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm);
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm);
 
-void OpenCL_initialize(OpenCL_Param* prm);
+void OpenCL_initialize(OpenCL_Param *prm);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c
index 78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.c
@@ -1,40 +1,36 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include "ocl.h"
 
-char* readFile(char* fileName)
-{
-	FILE* fp;
-	fp = fopen(fileName,"r");
+char *readFile(char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
 
-	if(fp == NULL)
-	{
-		printf("Error 1!\n");
-		return NULL;
-	}
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    return NULL;
+  }
 
-	fseek(fp,0,SEEK_END);
-	long size = ftell(fp);
-	rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-	char* buffer = malloc(sizeof(char)*(size+1));
-	if(buffer == NULL)
-	{
-		printf("Error 2!\n");
-		fclose(fp);
-		return NULL;
-	}
+  char *buffer = malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	size_t res = fread(buffer,1,size,fp);
-	if(res != size)
-	{
-		printf("Error 3!\n");
-		fclose(fp);
-		return NULL;
-	}
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	buffer[size] = 0;
-	fclose(fp);
-	return buffer;
-}	
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
+}
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h
index 5a08a6bab9a95fa8c0158741363dd2a5c92a45b7..5d5d984ba698d6ac71af3e51de3e6724a79135aa 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia/ocl.h
@@ -2,24 +2,22 @@
 #define __OCLH__
 
 typedef struct {
-	cl_platform_id clPlatform;
-	cl_context_properties clCps[3];
-	cl_device_id clDevice;
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-	cl_program clProgram;
-	cl_kernel clKernel;
+  cl_platform_id clPlatform;
+  cl_context_properties clCps[3];
+  cl_device_id clDevice;
+  cl_context clContext;
+  cl_command_queue clCommandQueue;
+  cl_program clProgram;
+  cl_kernel clKernel;
 } OpenCL_Param;
 
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-char* readFile(char*);
+char *readFile(char *);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/layout_config.h
index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/layout_config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/layout_config.h
@@ -13,31 +13,31 @@
 
 /*############################################################################*/
 
-//Unchangeable settings: volume simulation size for the given example
+// Unchangeable settings: volume simulation size for the given example
 #define SIZE_X (120)
 #define SIZE_Y (120)
 #define SIZE_Z (150)
 
-//Changeable settings
-//Padding in each dimension
+// Changeable settings
+// Padding in each dimension
 #define PADDING_X (8)
 #define PADDING_Y (0)
 #define PADDING_Z (4)
 
-//Pitch in each dimension
-#define PADDED_X (SIZE_X+PADDING_X)
-#define PADDED_Y (SIZE_Y+PADDING_Y)
-#define PADDED_Z (SIZE_Z+PADDING_Z)
+// Pitch in each dimension
+#define PADDED_X (SIZE_X + PADDING_X)
+#define PADDED_Y (SIZE_Y + PADDING_Y)
+#define PADDED_Z (SIZE_Z + PADDING_Z)
 
-#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
-#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)
+#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z)
+#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z)
 
 //  Flattening function
 //  This macro will be used to map a 3-D index and element to a value
-#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \
-                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
+#define CALC_INDEX(x, y, z, e)                                                 \
+  (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y))
 
-#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
+#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0))
 
 // Set this value to 1 for GATHER, or 0 for SCATTER
 #if 1
@@ -46,22 +46,41 @@
 #define SCATTER
 #endif
 
-//OpenCL block size (not trivially changeable here)
+// OpenCL block size (not trivially changeable here)
 #define BLOCK_SIZE SIZE_X
 
 /*############################################################################*/
 
-typedef enum {C = 0,
-              N, S, E, W, T, B,
-              NE, NW, SE, SW,
-              NT, NB, ST, SB,
-              ET, EB, WT, WB,
-              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+typedef enum {
+  C = 0,
+  N,
+  S,
+  E,
+  W,
+  T,
+  B,
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS,
+  N_CELL_ENTRIES
+} CELL_ENTRIES;
 
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
+typedef enum {
+  OBSTACLE = 1 << 0,
+  ACCEL = 1 << 1,
+  IN_OUT_FLOW = 1 << 2
+} CELL_FLAGS;
 
 #endif /* _CONFIG_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.c
index aab11ee0cb215bc918cffecf23e97c9eb528b71c..14ffa4211b3763d7c1c6538e693a76be61a0b158 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.c
@@ -10,338 +10,312 @@
 
 // includes, system
 #include <CL/cl.h>
+#include <float.h>
 #include <math.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <float.h>
 
 // includes, project
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
 #include "ocl.h"
-#include "lbm.h"
 
 /******************************************************************************/
 
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) {
-	 
-	cl_int clStatus;
-
-	clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid);
-	CHECK_ERROR("clSetKernelArg")
-
-	clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid);
-	CHECK_ERROR("clSetKernelArg")
-	
-	size_t dimBlock[3] = {SIZE_X,1,1};
-	size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1};
-	clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); 
-	CHECK_ERROR("clEnqueueNDRangeKernel") 	
-	
-	clStatus = clFinish(prm->clCommandQueue);
-	CHECK_ERROR("clFinish")
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid) {
+
+  cl_int clStatus;
+
+  clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  size_t dimBlock[3] = {SIZE_X, 1, 1};
+  size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1};
+  clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL,
+                                    dimGrid, dimBlock, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueNDRangeKernel")
+
+  clStatus = clFinish(prm->clCommandQueue);
+  CHECK_ERROR("clFinish")
 }
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr ) {
-	const size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
+void LBM_allocateGrid(float **ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+
+  *ptr = (float *)malloc(size);
+  if (!*ptr) {
+    printf("LBM_allocateGrid: could not allocate %.1f MByte\n",
+           size / (1024.0 * 1024.0));
+    exit(1);
+  }
 
-	*ptr = (float*)malloc( size );
-	if( ! *ptr ) {
-		printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
-				size / (1024.0*1024.0) );
-		exit( 1 );
-	}
+  memset(*ptr, 0, size);
 
-	memset( *ptr, 0, size );
+  printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0));
 
-	printf( "LBM_allocateGrid: allocated %.1f MByte\n",
-			size / (1024.0*1024.0) );
-	
-	*ptr += MARGIN;
+  *ptr += MARGIN;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-	cl_int clStatus;
-	*ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  *ptr =
+      clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_freeGrid( float** ptr ) {
-	free( *ptr-MARGIN );
-	*ptr = NULL;
+void LBM_freeGrid(float **ptr) {
+  free(*ptr - MARGIN);
+  *ptr = NULL;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_freeGrid(cl_mem ptr) {
-	clReleaseMemObject(ptr);
-}
+void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); }
 
 /*############################################################################*/
 
-void LBM_initializeGrid( LBM_Grid grid ) {
-	SWEEP_VAR
-
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-	SRC_C( grid  ) = DFL1;
-	SRC_N( grid  ) = DFL2;
-	SRC_S( grid  ) = DFL2;
-	SRC_E( grid  ) = DFL2;
-	SRC_W( grid  ) = DFL2;
-	SRC_T( grid  ) = DFL2;
-	SRC_B( grid  ) = DFL2;
-	SRC_NE( grid ) = DFL3;
-	SRC_NW( grid ) = DFL3;
-	SRC_SE( grid ) = DFL3;
-	SRC_SW( grid ) = DFL3;
-	SRC_NT( grid ) = DFL3;
-	SRC_NB( grid ) = DFL3;
-	SRC_ST( grid ) = DFL3;
-	SRC_SB( grid ) = DFL3;
-	SRC_ET( grid ) = DFL3;
-	SRC_EB( grid ) = DFL3;
-	SRC_WT( grid ) = DFL3;
-	SRC_WB( grid ) = DFL3;
-	
-	CLEAR_ALL_FLAGS_SWEEP( grid );
-	SWEEP_END
+void LBM_initializeGrid(LBM_Grid grid) {
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  SRC_C(grid) = DFL1;
+  SRC_N(grid) = DFL2;
+  SRC_S(grid) = DFL2;
+  SRC_E(grid) = DFL2;
+  SRC_W(grid) = DFL2;
+  SRC_T(grid) = DFL2;
+  SRC_B(grid) = DFL2;
+  SRC_NE(grid) = DFL3;
+  SRC_NW(grid) = DFL3;
+  SRC_SE(grid) = DFL3;
+  SRC_SW(grid) = DFL3;
+  SRC_NT(grid) = DFL3;
+  SRC_NB(grid) = DFL3;
+  SRC_ST(grid) = DFL3;
+  SRC_SB(grid) = DFL3;
+  SRC_ET(grid) = DFL3;
+  SRC_EB(grid) = DFL3;
+  SRC_WT(grid) = DFL3;
+  SRC_WB(grid) = DFL3;
+
+  CLEAR_ALL_FLAGS_SWEEP(grid);
+  SWEEP_END
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); 
-	cl_int clStatus;
-	clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                  h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
 }
 
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-        cl_int clStatus;
-        clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                 h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) {
-	cl_mem aux = *grid1;
-	*grid1 = *grid2;
-	*grid2 = aux;
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) {
+  cl_mem aux = *grid1;
+  *grid1 = *grid2;
+  *grid2 = aux;
 }
 
 /*############################################################################*/
 
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
-	int x,  y,  z;
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) {
+  int x, y, z;
 
-	FILE* file = fopen( filename, "rb" );
+  FILE *file = fopen(filename, "rb");
 
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE );
-			}
-			fgetc( file );
-		}
-		fgetc( file );
-	}
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (fgetc(file) != '.')
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+      }
+      fgetc(file);
+    }
+    fgetc(file);
+  }
 
-	fclose( file );
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
-	int x,  y,  z;
-
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-						y == 0 || y == SIZE_Y-1 ||
-						z == 0 || z == SIZE_Z-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-				}
-				else {
-					if( (z == 1 || z == SIZE_Z-2) &&
-							x > 1 && x < SIZE_X-2 &&
-							y > 1 && y < SIZE_Y-2 ) {
-						SET_FLAG( grid, x, y, z, ACCEL );
-					}
-				}
-			}
-		}
-	}
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) {
+  int x, y, z;
+
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 ||
+            z == SIZE_Z - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+        } else {
+          if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 &&
+              y < SIZE_Y - 2) {
+            SET_FLAG(grid, x, y, z, ACCEL);
+          }
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_showGridStatistics( LBM_Grid grid ) {
-	int nObstacleCells = 0,
-	    nAccelCells    = 0,
-	    nFluidCells    = 0;
-	float ux, uy, uz;
-	float minU2  = 1e+30, maxU2  = -1e+30, u2;
-	float minRho = 1e+30, maxRho = -1e+30, rho;
-	float mass = 0;
-
-	SWEEP_VAR
-
-		SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		rho = LOCAL( grid, C  ) + LOCAL( grid, N  )
-		+ LOCAL( grid, S  ) + LOCAL( grid, E  )
-		+ LOCAL( grid, W  ) + LOCAL( grid, T  )
-		+ LOCAL( grid, B  ) + LOCAL( grid, NE )
-		+ LOCAL( grid, NW ) + LOCAL( grid, SE )
-		+ LOCAL( grid, SW ) + LOCAL( grid, NT )
-		+ LOCAL( grid, NB ) + LOCAL( grid, ST )
-		+ LOCAL( grid, SB ) + LOCAL( grid, ET )
-		+ LOCAL( grid, EB ) + LOCAL( grid, WT )
-		+ LOCAL( grid, WB );
-
-	if( rho < minRho ) minRho = rho;
-	if( rho > maxRho ) maxRho = rho;
-	mass += rho;
-
-	if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
-		nObstacleCells++;
-	}
-	else {
-		if( TEST_FLAG_SWEEP( grid, ACCEL ))
-			nAccelCells++;
-		else
-			nFluidCells++;
-
-		ux = + LOCAL( grid, E  ) - LOCAL( grid, W  )
-			+ LOCAL( grid, NE ) - LOCAL( grid, NW )
-			+ LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, ET ) + LOCAL( grid, EB )
-			- LOCAL( grid, WT ) - LOCAL( grid, WB );
-		uy = + LOCAL( grid, N  ) - LOCAL( grid, S  )
-			+ LOCAL( grid, NE ) + LOCAL( grid, NW )
-			- LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, NT ) + LOCAL( grid, NB )
-			- LOCAL( grid, ST ) - LOCAL( grid, SB );
-		uz = + LOCAL( grid, T  ) - LOCAL( grid, B  )
-			+ LOCAL( grid, NT ) - LOCAL( grid, NB )
-			+ LOCAL( grid, ST ) - LOCAL( grid, SB )
-			+ LOCAL( grid, ET ) - LOCAL( grid, EB )
-			+ LOCAL( grid, WT ) - LOCAL( grid, WB );
-		u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);
-		if( u2 < minU2 ) minU2 = u2;
-		if( u2 > maxU2 ) maxU2 = u2;
-	}
-	SWEEP_END
-
-		printf( "LBM_showGridStatistics:\n"
-				"\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
-				"\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
-				"\tminU: %e maxU: %e\n\n",
-				nObstacleCells, nAccelCells, nFluidCells,
-				minRho, maxRho, mass,
-				sqrt( minU2 ), sqrt( maxU2 ) );
-
+void LBM_showGridStatistics(LBM_Grid grid) {
+  int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0;
+  float ux, uy, uz;
+  float minU2 = 1e+30, maxU2 = -1e+30, u2;
+  float minRho = 1e+30, maxRho = -1e+30, rho;
+  float mass = 0;
+
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) +
+        LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) +
+        LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) +
+        LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) +
+        LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB);
+
+  if (rho < minRho)
+    minRho = rho;
+  if (rho > maxRho)
+    maxRho = rho;
+  mass += rho;
+
+  if (TEST_FLAG_SWEEP(grid, OBSTACLE)) {
+    nObstacleCells++;
+  } else {
+    if (TEST_FLAG_SWEEP(grid, ACCEL))
+      nAccelCells++;
+    else
+      nFluidCells++;
+
+    ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) +
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) -
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) -
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) -
+         LOCAL(grid, ST) - LOCAL(grid, SB);
+    uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) +
+         LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) +
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho);
+    if (u2 < minU2)
+      minU2 = u2;
+    if (u2 > maxU2)
+      maxU2 = u2;
+  }
+  SWEEP_END
+
+  printf("LBM_showGridStatistics:\n"
+         "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
+         "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
+         "\tminU: %e maxU: %e\n\n",
+         nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass,
+         sqrt(minU2), sqrt(maxU2));
 }
 
 /*############################################################################*/
 
-static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		const char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1];
-
-		fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
-	else {                                                     /* little endian */
-		fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void storeValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    const char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    int i;
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1];
+
+    fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+  } else { /* little endian */
+    fwrite(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1];
-	}
-	else {                                                     /* little endian */
-		fread( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void loadValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    int i;
+
+    fread(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1];
+  } else { /* little endian */
+    fread(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-		const int binary ) {
-	OUTPUT_PRECISION rho, ux, uy, uz;
-
-	FILE* file = fopen( filename, (binary ? "wb" : "w") );
-
-	SWEEP_VAR
-	SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z)
-				rho = + SRC_C( grid ) + SRC_N( grid )
-					+ SRC_S( grid ) + SRC_E( grid )
-					+ SRC_W( grid ) + SRC_T( grid )
-					+ SRC_B( grid ) + SRC_NE( grid )
-					+ SRC_NW( grid ) + SRC_SE( grid )
-					+ SRC_SW( grid ) + SRC_NT( grid )
-					+ SRC_NB( grid ) + SRC_ST( grid )
-					+ SRC_SB( grid ) + SRC_ET( grid )
-					+ SRC_EB( grid ) + SRC_WT( grid )
-					+ SRC_WB( grid );
-				ux = + SRC_E( grid ) - SRC_W( grid ) 
-					+ SRC_NE( grid ) - SRC_NW( grid ) 
-					+ SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_ET( grid ) + SRC_EB( grid ) 
-					- SRC_WT( grid ) - SRC_WB( grid );
-				uy = + SRC_N( grid ) - SRC_S( grid ) 
-					+ SRC_NE( grid ) + SRC_NW( grid ) 
-					- SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_NT( grid ) + SRC_NB( grid ) 
-					- SRC_ST( grid ) - SRC_SB( grid );
-				uz = + SRC_T( grid ) - SRC_B( grid ) 
-					+ SRC_NT( grid ) - SRC_NB( grid ) 
-					+ SRC_ST( grid ) - SRC_SB( grid ) 
-					+ SRC_ET( grid ) - SRC_EB( grid ) 
-					+ SRC_WT( grid ) - SRC_WB( grid );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					/*
-					   fwrite( &ux, sizeof( ux ), 1, file );
-					   fwrite( &uy, sizeof( uy ), 1, file );
-					   fwrite( &uz, sizeof( uz ), 1, file );
-					   */
-					storeValue( file, &ux );
-					storeValue( file, &uy );
-					storeValue( file, &uz );
-				} else
-					fprintf( file, "%e %e %e\n", ux, uy, uz );
-
-	SWEEP_END;
-
-	fclose( file );
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const int binary) {
+  OUTPUT_PRECISION rho, ux, uy, uz;
+
+  FILE *file = fopen(filename, (binary ? "wb" : "w"));
+
+  SWEEP_VAR
+  SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z)
+  rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) +
+        SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) +
+        SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) +
+        SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) +
+        SRC_WB(grid);
+  ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) -
+       SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid);
+  uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) -
+       SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid);
+  uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) -
+       SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid);
+  ux /= rho;
+  uy /= rho;
+  uz /= rho;
+
+  if (binary) {
+    /*
+       fwrite( &ux, sizeof( ux ), 1, file );
+       fwrite( &uy, sizeof( uy ), 1, file );
+       fwrite( &uz, sizeof( uz ), 1, file );
+       */
+    storeValue(file, &ux);
+    storeValue(file, &uy);
+    storeValue(file, &uz);
+  } else
+    fprintf(file, "%e %e %e\n", ux, uy, uz);
+
+  SWEEP_END;
+
+  fclose(file);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.h
index 8070cf3030305619453064ca9fbf2a4c4a23c24b..b687e8ebad95099908d0d214243b6e290e871cf5 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm.h
@@ -13,23 +13,26 @@
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2);
+void LBM_showGridStatistics(LBM_Grid Grid);
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary);
 
 /* OpenCL *********************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr );
-void OpenCL_LBM_freeGrid( cl_mem ptr );
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid );
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr);
+void OpenCL_LBM_freeGrid(cl_mem ptr);
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid);
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid);
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm_macros.h
index 2f8ba8a09c93f68815ec5ce41d18821fa7396e40..d789964063797f77346bfb53eaad3f7ff8695ced 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm_macros.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/lbm_macros.h
@@ -17,160 +17,181 @@
 #define TRUE (-1)
 #define FALSE (0)
 
-#define DFL1 (1.0f/ 3.0f)
-#define DFL2 (1.0f/18.0f)
-#define DFL3 (1.0f/36.0f)
+#define DFL1 (1.0f / 3.0f)
+#define DFL2 (1.0f / 18.0f)
+#define DFL3 (1.0f / 36.0f)
 
 /*############################################################################*/
 
-typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+typedef float
+    *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
+typedef LBM_Grid *LBM_GridPtr;
 
 /*############################################################################*/
 
-
-#define SWEEP_X  __temp_x__
-#define SWEEP_Y  __temp_y__
-#define SWEEP_Z  __temp_z__
+#define SWEEP_X __temp_x__
+#define SWEEP_Y __temp_y__
+#define SWEEP_Z __temp_z__
 #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-	for( __temp_z__ = z1; \
-	     __temp_z__ < z2; \
-		__temp_z__++) { \
-            for( __temp_y__ = 0; \
-                 __temp_y__ < SIZE_Y; \
-                 __temp_y__++) { \
-		for(__temp_x__ = 0; \
-	            __temp_x__ < SIZE_X; \
-                    __temp_x__++) { \
-
-#define SWEEP_END }}}
-
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) {                       \
+    for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) {                  \
+      for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) {
+
+#define SWEEP_END                                                              \
+  }                                                                            \
+  }                                                                            \
+  }
+
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)])
+
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #ifdef SCATTER
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C))
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+
+#define DST_C(g) (NEIGHBOR_C(g, C))
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* GATHER */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C))
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* GATHER */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
+#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v)
+
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c
index c95d33e0409daa902ec2bc939a10081b99b259a1..5e43b754279910d3ca3b45d40184df666138f9e5 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.c
@@ -15,10 +15,10 @@
 #include <sys/stat.h>
 
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
-#include "ocl.h"
 #include "main.h"
-#include "lbm.h"
+#include "ocl.h"
 
 /*############################################################################*/
 
@@ -27,259 +27,266 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid;
 /*############################################################################*/
 
 struct pb_TimerSet timers;
-int main( int nArgs, char* arg[] ) {
-    MAIN_Param param;
-    int t;
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
+  int t;
 
-    OpenCL_Param prm;
+  OpenCL_Param prm;
 
-    struct pb_Parameters* params;
-    params = pb_ReadParameters(&nArgs, arg);
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
 
+  // Setup TEMP datastructures
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
 
-    //Setup TEMP datastructures
-    MAIN_parseCommandLine( nArgs, arg, &param, params );
-    MAIN_printInfo( &param );
+  /*MAIN_initialize( &param, &prm ); */ // This has been inlined
 
-    /*MAIN_initialize( &param, &prm ); */ // This has been inlined
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
 
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  if (param.obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename);
+  }
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    if( param.obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename );
-    }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
+
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
-
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
-
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
- 
-    OpenCL_initialize(&prm);
-
-    //Setup DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid );
-
-    //Initialize DEVICE datastructures
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    for( int i=0; i < 1; i++) {
-      for( t = 1; t <= param.nTimeSteps; t++ ) {
-          /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-          OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid );
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-          LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid );
-
-          if( (t & 63) == 0 ) {
-              printf( "timestep: %i\n", t );
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  OpenCL_initialize(&prm);
+
+  // Setup DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid);
+
+  // Initialize DEVICE datastructures
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  for (int i = 0; i < 1; i++) {
+    for (t = 1; t <= param.nTimeSteps; t++) {
+      /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+      OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+      LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid);
+
+      if ((t & 63) == 0) {
+        printf("timestep: %i\n", t);
 #if 0
               CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
               LBM_showGridStatistics( *TEMP_srcGrid );
 #endif
-          }
       }
     }
-    /*MAIN_finalize( &param, &prm );*/ // inlined
+  }
+  /*MAIN_finalize( &param, &prm );*/ // inlined
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm.clProgram);
-    clReleaseKernel(prm.clKernel);
-    clReleaseCommandQueue(prm.clCommandQueue);
-    clReleaseContext(prm.clContext);
+  clReleaseProgram(prm.clProgram);
+  clReleaseKernel(prm.clKernel);
+  clReleaseCommandQueue(prm.clCommandQueue);
+  clReleaseContext(prm.clContext);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
-    LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
+  LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 
-    pb_FreeParameters(params);
-    return 0;
+  pb_FreeParameters(params);
+  return 0;
 }
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) {
-    struct stat fileStat;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
 
-    if( nArgs < 2 ) {
-        printf( "syntax: lbm <time steps>\n" );
-        exit( 1 );
-    }
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
 
-    param->nTimeSteps     = atoi( arg[1] );
-
-    if( params->inpFiles[0] != NULL ) {
-        param->obstacleFilename = params->inpFiles[0];
-
-        if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-            printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-                    param->obstacleFilename );
-            exit( 1 );
-        }
-        if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-            printf( "MAIN_parseCommandLine:\n"
-                    "\tsize of file '%s' is %i bytes\n"
-                    "\texpected size is %i bytes\n",
-                    param->obstacleFilename, (int) fileStat.st_size,
-                    SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-            exit( 1 );
-        }
+  param->nTimeSteps = atoi(arg[1]);
+
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
+    }
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
     }
-    else param->obstacleFilename = NULL;
+  } else
+    param->obstacleFilename = NULL;
 
-    param->resultFilename = params->outFile;
+  param->resultFilename = params->outFile;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-    printf( "MAIN_printInfo:\n"
-            "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-            "\tnTimeSteps     : %i\n"
-            "\tresult file    : %s\n"
-            "\taction         : %s\n"
-            "\tsimulation type: %s\n"
-            "\tobstacle file  : %s\n\n",
-            SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-            param->nTimeSteps, param->resultFilename,
-            "store", "lid-driven cavity",
-            (param->obstacleFilename == NULL) ? "<none>" :
-            param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    if( param->obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename );
-    }
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    //Setup DEVICE datastructures
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid );
-
-    //Initialize DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
-
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  // Setup DEVICE datastructures
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
+
+  // Initialize DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
+
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_Grid TEMP_srcGrid;
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_Grid TEMP_srcGrid;
 
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE );
+  LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
 
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm->clProgram);
-    clReleaseKernel(prm->clKernel);
-    clReleaseCommandQueue(prm->clCommandQueue);
-    clReleaseContext(prm->clContext);
+  clReleaseProgram(prm->clProgram);
+  clReleaseKernel(prm->clKernel);
+  clReleaseCommandQueue(prm->clCommandQueue);
+  clReleaseContext(prm->clContext);
 }
 
-void OpenCL_initialize(OpenCL_Param* prm)
-{
-    cl_int clStatus;
+void OpenCL_initialize(OpenCL_Param *prm) {
+  cl_int clStatus;
 
-    clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL);
-    CHECK_ERROR("clGetPlatformIDs")
+  clStatus = clGetPlatformIDs(1, &(prm->clPlatform), NULL);
+  CHECK_ERROR("clGetPlatformIDs")
 
-    prm->clCps[0] = CL_CONTEXT_PLATFORM;
-    prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
-    prm->clCps[2] = 0;
+  prm->clCps[0] = CL_CONTEXT_PLATFORM;
+  prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
+  prm->clCps[2] = 0;
 
-    clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL);
-    CHECK_ERROR("clGetDeviceIDs")
+  clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_GPU, 1,
+                            &(prm->clDevice), NULL);
+  CHECK_ERROR("clGetDeviceIDs")
 
-    prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
+  prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_GPU, NULL,
+                                           NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
 
-    prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
+  prm->clCommandQueue = clCreateCommandQueue(
+      prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
 
-    pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
+  pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
 
-    //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-    //prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);
-    //CHECK_ERROR("clCreateProgramWithSource")
+  // const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+  // prm->clProgram =
+  // clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
 
-    //char clOptions[100];
-    //sprintf(clOptions,"-I src/opencl_nvidia");
+  // char clOptions[100];
+  // sprintf(clOptions,"-I src/opencl_nvidia");
 
-    //clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);
-    //CHECK_ERROR("clBuildProgram")
+  // clStatus =
+  // clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
 
-    //prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);
-    //CHECK_ERROR("clCreateKernel")
+  // prm->clKernel =
+  // clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
 
-    //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
-    pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_long_default/kernel_offline.nvptx.s", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel);
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_nvidia_long_default/kernel_offline.nvptx.s",
+      "performStreamCollide_kernel", &prm->clContext, &prm->clDevice,
+      &prm->clProgram, &prm->clKernel);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.h
index 2ca41792bbd8ed8d7596d52e1ef79038935617ca..5f58edc2616cece34c4b3d0467f991d9c4bd93c9 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/main.h
@@ -12,19 +12,20 @@
 /*############################################################################*/
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm );
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm);
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm);
 
-void OpenCL_initialize(OpenCL_Param* prm);
+void OpenCL_initialize(OpenCL_Param *prm);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.c
index 78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.c
@@ -1,40 +1,36 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include "ocl.h"
 
-char* readFile(char* fileName)
-{
-	FILE* fp;
-	fp = fopen(fileName,"r");
+char *readFile(char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
 
-	if(fp == NULL)
-	{
-		printf("Error 1!\n");
-		return NULL;
-	}
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    return NULL;
+  }
 
-	fseek(fp,0,SEEK_END);
-	long size = ftell(fp);
-	rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-	char* buffer = malloc(sizeof(char)*(size+1));
-	if(buffer == NULL)
-	{
-		printf("Error 2!\n");
-		fclose(fp);
-		return NULL;
-	}
+  char *buffer = malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	size_t res = fread(buffer,1,size,fp);
-	if(res != size)
-	{
-		printf("Error 3!\n");
-		fclose(fp);
-		return NULL;
-	}
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	buffer[size] = 0;
-	fclose(fp);
-	return buffer;
-}	
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
+}
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.h
index 5a08a6bab9a95fa8c0158741363dd2a5c92a45b7..5d5d984ba698d6ac71af3e51de3e6724a79135aa 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_long/ocl.h
@@ -2,24 +2,22 @@
 #define __OCLH__
 
 typedef struct {
-	cl_platform_id clPlatform;
-	cl_context_properties clCps[3];
-	cl_device_id clDevice;
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-	cl_program clProgram;
-	cl_kernel clKernel;
+  cl_platform_id clPlatform;
+  cl_context_properties clCps[3];
+  cl_device_id clDevice;
+  cl_context clContext;
+  cl_command_queue clCommandQueue;
+  cl_program clProgram;
+  cl_kernel clKernel;
 } OpenCL_Param;
 
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-char* readFile(char*);
+char *readFile(char *);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/layout_config.h
index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/layout_config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/layout_config.h
@@ -13,31 +13,31 @@
 
 /*############################################################################*/
 
-//Unchangeable settings: volume simulation size for the given example
+// Unchangeable settings: volume simulation size for the given example
 #define SIZE_X (120)
 #define SIZE_Y (120)
 #define SIZE_Z (150)
 
-//Changeable settings
-//Padding in each dimension
+// Changeable settings
+// Padding in each dimension
 #define PADDING_X (8)
 #define PADDING_Y (0)
 #define PADDING_Z (4)
 
-//Pitch in each dimension
-#define PADDED_X (SIZE_X+PADDING_X)
-#define PADDED_Y (SIZE_Y+PADDING_Y)
-#define PADDED_Z (SIZE_Z+PADDING_Z)
+// Pitch in each dimension
+#define PADDED_X (SIZE_X + PADDING_X)
+#define PADDED_Y (SIZE_Y + PADDING_Y)
+#define PADDED_Z (SIZE_Z + PADDING_Z)
 
-#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
-#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)
+#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z)
+#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z)
 
 //  Flattening function
 //  This macro will be used to map a 3-D index and element to a value
-#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \
-                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
+#define CALC_INDEX(x, y, z, e)                                                 \
+  (TOTAL_PADDED_CELLS * (e) + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y))
 
-#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
+#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0))
 
 // Set this value to 1 for GATHER, or 0 for SCATTER
 #if 1
@@ -46,22 +46,41 @@
 #define SCATTER
 #endif
 
-//OpenCL block size (not trivially changeable here)
+// OpenCL block size (not trivially changeable here)
 #define BLOCK_SIZE SIZE_X
 
 /*############################################################################*/
 
-typedef enum {C = 0,
-              N, S, E, W, T, B,
-              NE, NW, SE, SW,
-              NT, NB, ST, SB,
-              ET, EB, WT, WB,
-              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+typedef enum {
+  C = 0,
+  N,
+  S,
+  E,
+  W,
+  T,
+  B,
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS,
+  N_CELL_ENTRIES
+} CELL_ENTRIES;
 
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
+typedef enum {
+  OBSTACLE = 1 << 0,
+  ACCEL = 1 << 1,
+  IN_OUT_FLOW = 1 << 2
+} CELL_FLAGS;
 
 #endif /* _CONFIG_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.c
index aab11ee0cb215bc918cffecf23e97c9eb528b71c..14ffa4211b3763d7c1c6538e693a76be61a0b158 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.c
@@ -10,338 +10,312 @@
 
 // includes, system
 #include <CL/cl.h>
+#include <float.h>
 #include <math.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <float.h>
 
 // includes, project
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
 #include "ocl.h"
-#include "lbm.h"
 
 /******************************************************************************/
 
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) {
-	 
-	cl_int clStatus;
-
-	clStatus = clSetKernelArg(prm->clKernel,0,sizeof(cl_mem),(void*)&srcGrid);
-	CHECK_ERROR("clSetKernelArg")
-
-	clStatus = clSetKernelArg(prm->clKernel,1,sizeof(cl_mem),(void*)&dstGrid);
-	CHECK_ERROR("clSetKernelArg")
-	
-	size_t dimBlock[3] = {SIZE_X,1,1};
-	size_t dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1};
-	clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue,prm->clKernel,3,NULL,dimGrid,dimBlock,0,NULL,NULL); 
-	CHECK_ERROR("clEnqueueNDRangeKernel") 	
-	
-	clStatus = clFinish(prm->clCommandQueue);
-	CHECK_ERROR("clFinish")
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid) {
+  /* Bind the two grids as kernel args 0 and 1, launch, then wait. */
+  cl_int clStatus;
+
+  clStatus = clSetKernelArg(prm->clKernel, 0, sizeof(cl_mem), (void *)&srcGrid);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(prm->clKernel, 1, sizeof(cl_mem), (void *)&dstGrid);
+  CHECK_ERROR("clSetKernelArg")
+  /* Global size SIZE_X*SIZE_Y by SIZE_Z, in work-groups of SIZE_X items. */
+  size_t dimBlock[3] = {SIZE_X, 1, 1};
+  size_t dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1};
+  clStatus = clEnqueueNDRangeKernel(prm->clCommandQueue, prm->clKernel, 3, NULL,
+                                    dimGrid, dimBlock, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueNDRangeKernel")
+  /* Block until the queue drains; CHECK_ERROR exits on any failure. */
+  clStatus = clFinish(prm->clCommandQueue);
+  CHECK_ERROR("clFinish")
 }
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr ) {
-	const size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
+void LBM_allocateGrid(float **ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+
+  *ptr = (float *)malloc(size);
+  if (!*ptr) {
+    printf("LBM_allocateGrid: could not allocate %.1f MByte\n",
+           size / (1024.0 * 1024.0));
+    exit(1);
+  }
 
-	*ptr = (float*)malloc( size );
-	if( ! *ptr ) {
-		printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
-				size / (1024.0*1024.0) );
-		exit( 1 );
-	}
+  memset(*ptr, 0, size);
 
-	memset( *ptr, 0, size );
+  printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0));
 
-	printf( "LBM_allocateGrid: allocated %.1f MByte\n",
-			size / (1024.0*1024.0) );
-	
-	*ptr += MARGIN;
+  *ptr += MARGIN;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-	cl_int clStatus;
-	*ptr = clCreateBuffer(prm->clContext,CL_MEM_READ_WRITE,size,NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  *ptr =
+      clCreateBuffer(prm->clContext, CL_MEM_READ_WRITE, size, NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_freeGrid( float** ptr ) {
-	free( *ptr-MARGIN );
-	*ptr = NULL;
+void LBM_freeGrid(float **ptr) {
+  free(*ptr - MARGIN);
+  *ptr = NULL;
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_freeGrid(cl_mem ptr) {
-	clReleaseMemObject(ptr);
-}
+void OpenCL_LBM_freeGrid(cl_mem ptr) { clReleaseMemObject(ptr); }
 
 /*############################################################################*/
 
-void LBM_initializeGrid( LBM_Grid grid ) {
-	SWEEP_VAR
-
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-	SRC_C( grid  ) = DFL1;
-	SRC_N( grid  ) = DFL2;
-	SRC_S( grid  ) = DFL2;
-	SRC_E( grid  ) = DFL2;
-	SRC_W( grid  ) = DFL2;
-	SRC_T( grid  ) = DFL2;
-	SRC_B( grid  ) = DFL2;
-	SRC_NE( grid ) = DFL3;
-	SRC_NW( grid ) = DFL3;
-	SRC_SE( grid ) = DFL3;
-	SRC_SW( grid ) = DFL3;
-	SRC_NT( grid ) = DFL3;
-	SRC_NB( grid ) = DFL3;
-	SRC_ST( grid ) = DFL3;
-	SRC_SB( grid ) = DFL3;
-	SRC_ET( grid ) = DFL3;
-	SRC_EB( grid ) = DFL3;
-	SRC_WT( grid ) = DFL3;
-	SRC_WB( grid ) = DFL3;
-	
-	CLEAR_ALL_FLAGS_SWEEP( grid );
-	SWEEP_END
+void LBM_initializeGrid(LBM_Grid grid) {
+  SWEEP_VAR
+  /* Per swept cell: C = DFL1, one-letter entries DFL2, two-letter DFL3. */
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  SRC_C(grid) = DFL1;
+  SRC_N(grid) = DFL2;
+  SRC_S(grid) = DFL2;
+  SRC_E(grid) = DFL2;
+  SRC_W(grid) = DFL2;
+  SRC_T(grid) = DFL2;
+  SRC_B(grid) = DFL2;
+  SRC_NE(grid) = DFL3;
+  SRC_NW(grid) = DFL3;
+  SRC_SE(grid) = DFL3;
+  SRC_SW(grid) = DFL3;
+  SRC_NT(grid) = DFL3;
+  SRC_NB(grid) = DFL3;
+  SRC_ST(grid) = DFL3;
+  SRC_SB(grid) = DFL3;
+  SRC_ET(grid) = DFL3;
+  SRC_EB(grid) = DFL3;
+  SRC_WT(grid) = DFL3;
+  SRC_WB(grid) = DFL3;
+
+  CLEAR_ALL_FLAGS_SWEEP(grid);
+  SWEEP_END
 }
 
 /******************************************************************************/
 
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float ); 
-	cl_int clStatus;
-	clStatus = clEnqueueWriteBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueWriteBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                  h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
 }
 
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
-	const size_t size = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-        cl_int clStatus;
-        clStatus = clEnqueueReadBuffer(prm->clCommandQueue,d_grid,CL_TRUE,0,size,h_grid-MARGIN,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  cl_int clStatus;
+  clStatus = clEnqueueReadBuffer(prm->clCommandQueue, d_grid, CL_TRUE, 0, size,
+                                 h_grid - MARGIN, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 }
 
 /*############################################################################*/
 
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) {
-	cl_mem aux = *grid1;
-	*grid1 = *grid2;
-	*grid2 = aux;
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2) {
+  cl_mem aux = *grid1;
+  *grid1 = *grid2;
+  *grid2 = aux;
 }
 
 /*############################################################################*/
 
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
-	int x,  y,  z;
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) {
+  int x, y, z;
 
-	FILE* file = fopen( filename, "rb" );
+  FILE *file = fopen(filename, "rb");
 
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE );
-			}
-			fgetc( file );
-		}
-		fgetc( file );
-	}
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (fgetc(file) != '.')
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+      }
+      fgetc(file);
+    }
+    fgetc(file);
+  }
 
-	fclose( file );
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
-	int x,  y,  z;
-
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-						y == 0 || y == SIZE_Y-1 ||
-						z == 0 || z == SIZE_Z-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-				}
-				else {
-					if( (z == 1 || z == SIZE_Z-2) &&
-							x > 1 && x < SIZE_X-2 &&
-							y > 1 && y < SIZE_Y-2 ) {
-						SET_FLAG( grid, x, y, z, ACCEL );
-					}
-				}
-			}
-		}
-	}
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) {
+  int x, y, z;
+  /* Mark domain faces OBSTACLE and the z=1 / z=SIZE_Z-2 interiors ACCEL. */
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 ||
+            z == SIZE_Z - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+        } else {
+          if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 &&
+              y < SIZE_Y - 2) {
+            SET_FLAG(grid, x, y, z, ACCEL);
+          }
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_showGridStatistics( LBM_Grid grid ) {
-	int nObstacleCells = 0,
-	    nAccelCells    = 0,
-	    nFluidCells    = 0;
-	float ux, uy, uz;
-	float minU2  = 1e+30, maxU2  = -1e+30, u2;
-	float minRho = 1e+30, maxRho = -1e+30, rho;
-	float mass = 0;
-
-	SWEEP_VAR
-
-		SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		rho = LOCAL( grid, C  ) + LOCAL( grid, N  )
-		+ LOCAL( grid, S  ) + LOCAL( grid, E  )
-		+ LOCAL( grid, W  ) + LOCAL( grid, T  )
-		+ LOCAL( grid, B  ) + LOCAL( grid, NE )
-		+ LOCAL( grid, NW ) + LOCAL( grid, SE )
-		+ LOCAL( grid, SW ) + LOCAL( grid, NT )
-		+ LOCAL( grid, NB ) + LOCAL( grid, ST )
-		+ LOCAL( grid, SB ) + LOCAL( grid, ET )
-		+ LOCAL( grid, EB ) + LOCAL( grid, WT )
-		+ LOCAL( grid, WB );
-
-	if( rho < minRho ) minRho = rho;
-	if( rho > maxRho ) maxRho = rho;
-	mass += rho;
-
-	if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
-		nObstacleCells++;
-	}
-	else {
-		if( TEST_FLAG_SWEEP( grid, ACCEL ))
-			nAccelCells++;
-		else
-			nFluidCells++;
-
-		ux = + LOCAL( grid, E  ) - LOCAL( grid, W  )
-			+ LOCAL( grid, NE ) - LOCAL( grid, NW )
-			+ LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, ET ) + LOCAL( grid, EB )
-			- LOCAL( grid, WT ) - LOCAL( grid, WB );
-		uy = + LOCAL( grid, N  ) - LOCAL( grid, S  )
-			+ LOCAL( grid, NE ) + LOCAL( grid, NW )
-			- LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, NT ) + LOCAL( grid, NB )
-			- LOCAL( grid, ST ) - LOCAL( grid, SB );
-		uz = + LOCAL( grid, T  ) - LOCAL( grid, B  )
-			+ LOCAL( grid, NT ) - LOCAL( grid, NB )
-			+ LOCAL( grid, ST ) - LOCAL( grid, SB )
-			+ LOCAL( grid, ET ) - LOCAL( grid, EB )
-			+ LOCAL( grid, WT ) - LOCAL( grid, WB );
-		u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);
-		if( u2 < minU2 ) minU2 = u2;
-		if( u2 > maxU2 ) maxU2 = u2;
-	}
-	SWEEP_END
-
-		printf( "LBM_showGridStatistics:\n"
-				"\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
-				"\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
-				"\tminU: %e maxU: %e\n\n",
-				nObstacleCells, nAccelCells, nFluidCells,
-				minRho, maxRho, mass,
-				sqrt( minU2 ), sqrt( maxU2 ) );
-
+void LBM_showGridStatistics(LBM_Grid grid) {
+  int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0;
+  float ux, uy, uz;
+  float minU2 = 1e+30, maxU2 = -1e+30, u2;
+  float minRho = 1e+30, maxRho = -1e+30, rho;
+  float mass = 0;
+  /* SWEEP_VAR declares the loop counters that SWEEP_START iterates. */
+  SWEEP_VAR
+  /* rho sums the 19 distribution entries (C..WB) of the current cell. */
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) +
+        LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) +
+        LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) +
+        LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) +
+        LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB);
+  /* Track the density extremes and the total mass over all swept cells. */
+  if (rho < minRho)
+    minRho = rho;
+  if (rho > maxRho)
+    maxRho = rho;
+  mass += rho;
+  /* Classify the cell; velocity is only evaluated for non-obstacles. */
+  if (TEST_FLAG_SWEEP(grid, OBSTACLE)) {
+    nObstacleCells++;
+  } else {
+    if (TEST_FLAG_SWEEP(grid, ACCEL))
+      nAccelCells++;
+    else
+      nFluidCells++;
+    /* ux/uy/uz add the E/N/T-ward entries and subtract the W/S/B-ward. */
+    ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) +
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) -
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) -
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) -
+         LOCAL(grid, ST) - LOCAL(grid, SB);
+    uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) +
+         LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) +
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho);
+    if (u2 < minU2)
+      minU2 = u2;
+    if (u2 > maxU2)
+      maxU2 = u2;
+  }
+  SWEEP_END
+  /* sqrt() turns the squared-velocity extremes into magnitudes. */
+  printf("LBM_showGridStatistics:\n"
+         "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
+         "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
+         "\tminU: %e maxU: %e\n\n",
+         nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass,
+         sqrt(minU2), sqrt(maxU2));
 }
 
 /*############################################################################*/
 
-static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		const char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1];
-
-		fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
-	else {                                                     /* little endian */
-		fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void storeValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int littleBigEndianTest = 1; /* first byte is 0 on big endian */
+  if ((*((unsigned char *)&littleBigEndianTest)) == 0) { /* big endian */
+    const char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    size_t i; /* unsigned, to match the sizeof() loop bound */
+    /* Reverse the bytes so the file always receives little-endian data. */
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1];
+
+    fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+  } else { /* little endian */
+    fwrite(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1];
-	}
-	else {                                                     /* little endian */
-		fread( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void loadValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int littleBigEndianTest = 1; /* first byte is 0 on big endian */
+  if ((*((unsigned char *)&littleBigEndianTest)) == 0) { /* big endian */
+    char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    size_t i; /* unsigned, to match the sizeof() loop bound */
+
+    fread(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+    /* The file is little endian: reverse the bytes into host order. */
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1];
+  } else { /* little endian */
+    fread(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-		const int binary ) {
-	OUTPUT_PRECISION rho, ux, uy, uz;
-
-	FILE* file = fopen( filename, (binary ? "wb" : "w") );
-
-	SWEEP_VAR
-	SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z)
-				rho = + SRC_C( grid ) + SRC_N( grid )
-					+ SRC_S( grid ) + SRC_E( grid )
-					+ SRC_W( grid ) + SRC_T( grid )
-					+ SRC_B( grid ) + SRC_NE( grid )
-					+ SRC_NW( grid ) + SRC_SE( grid )
-					+ SRC_SW( grid ) + SRC_NT( grid )
-					+ SRC_NB( grid ) + SRC_ST( grid )
-					+ SRC_SB( grid ) + SRC_ET( grid )
-					+ SRC_EB( grid ) + SRC_WT( grid )
-					+ SRC_WB( grid );
-				ux = + SRC_E( grid ) - SRC_W( grid ) 
-					+ SRC_NE( grid ) - SRC_NW( grid ) 
-					+ SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_ET( grid ) + SRC_EB( grid ) 
-					- SRC_WT( grid ) - SRC_WB( grid );
-				uy = + SRC_N( grid ) - SRC_S( grid ) 
-					+ SRC_NE( grid ) + SRC_NW( grid ) 
-					- SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_NT( grid ) + SRC_NB( grid ) 
-					- SRC_ST( grid ) - SRC_SB( grid );
-				uz = + SRC_T( grid ) - SRC_B( grid ) 
-					+ SRC_NT( grid ) - SRC_NB( grid ) 
-					+ SRC_ST( grid ) - SRC_SB( grid ) 
-					+ SRC_ET( grid ) - SRC_EB( grid ) 
-					+ SRC_WT( grid ) - SRC_WB( grid );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					/*
-					   fwrite( &ux, sizeof( ux ), 1, file );
-					   fwrite( &uy, sizeof( uy ), 1, file );
-					   fwrite( &uz, sizeof( uz ), 1, file );
-					   */
-					storeValue( file, &ux );
-					storeValue( file, &uy );
-					storeValue( file, &uz );
-				} else
-					fprintf( file, "%e %e %e\n", ux, uy, uz );
-
-	SWEEP_END;
-
-	fclose( file );
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const int binary) {
+  OUTPUT_PRECISION rho, ux, uy, uz;
+  /* NOTE(review): the fopen() result is used below without a NULL check. */
+  FILE *file = fopen(filename, (binary ? "wb" : "w"));
+  /* Per cell: rho sums all 19 entries; ux/uy/uz are then divided by rho. */
+  SWEEP_VAR
+  SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z)
+  rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) +
+        SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) +
+        SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) +
+        SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) +
+        SRC_WB(grid);
+  ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) -
+       SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid);
+  uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) -
+       SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid);
+  uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) -
+       SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid);
+  ux /= rho;
+  uy /= rho;
+  uz /= rho;
+  /* Binary mode goes through storeValue(); text mode prints "%e %e %e". */
+  if (binary) {
+    /*
+       fwrite( &ux, sizeof( ux ), 1, file );
+       fwrite( &uy, sizeof( uy ), 1, file );
+       fwrite( &uz, sizeof( uz ), 1, file );
+       */
+    storeValue(file, &ux);
+    storeValue(file, &uy);
+    storeValue(file, &uz);
+  } else
+    fprintf(file, "%e %e %e\n", ux, uy, uz);
+
+  SWEEP_END;
+
+  fclose(file);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.h
index 8070cf3030305619453064ca9fbf2a4c4a23c24b..b687e8ebad95099908d0d214243b6e290e871cf5 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm.h
@@ -13,23 +13,26 @@
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_swapGrids(cl_mem *grid1, cl_mem *grid2);
+void LBM_showGridStatistics(LBM_Grid Grid);
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary);
 
 /* OpenCL *********************************************************************/
 
-void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr );
-void OpenCL_LBM_freeGrid( cl_mem ptr );
-void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
-void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid );
+void OpenCL_LBM_allocateGrid(const OpenCL_Param *prm, cl_mem *ptr);
+void OpenCL_LBM_freeGrid(cl_mem ptr);
+void OpenCL_LBM_initializeGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                               LBM_Grid h_grid);
+void OpenCL_LBM_getDeviceGrid(const OpenCL_Param *prm, cl_mem d_grid,
+                              LBM_Grid h_grid);
+void OpenCL_LBM_performStreamCollide(const OpenCL_Param *prm, cl_mem srcGrid,
+                                     cl_mem dstGrid);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm_macros.h
index 2f8ba8a09c93f68815ec5ce41d18821fa7396e40..d789964063797f77346bfb53eaad3f7ff8695ced 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm_macros.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/lbm_macros.h
@@ -17,160 +17,181 @@
 #define TRUE (-1)
 #define FALSE (0)
 
-#define DFL1 (1.0f/ 3.0f)
-#define DFL2 (1.0f/18.0f)
-#define DFL3 (1.0f/36.0f)
+#define DFL1 (1.0f / 3.0f)
+#define DFL2 (1.0f / 18.0f)
+#define DFL3 (1.0f / 36.0f)
 
 /*############################################################################*/
 
-typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+typedef float
+    *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
+typedef LBM_Grid *LBM_GridPtr;
 
 /*############################################################################*/
 
-
-#define SWEEP_X  __temp_x__
-#define SWEEP_Y  __temp_y__
-#define SWEEP_Z  __temp_z__
+#define SWEEP_X __temp_x__
+#define SWEEP_Y __temp_y__
+#define SWEEP_Z __temp_z__
 #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-	for( __temp_z__ = z1; \
-	     __temp_z__ < z2; \
-		__temp_z__++) { \
-            for( __temp_y__ = 0; \
-                 __temp_y__ < SIZE_Y; \
-                 __temp_y__++) { \
-		for(__temp_x__ = 0; \
-	            __temp_x__ < SIZE_X; \
-                    __temp_x__++) { \
-
-#define SWEEP_END }}}
-
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) {                       \
+    for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) {                  \
+      for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) {
+
+#define SWEEP_END                                                              \
+  }                                                                            \
+  }                                                                            \
+  }
+
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)])
+
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #ifdef SCATTER
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C))
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+
+#define DST_C(g) (NEIGHBOR_C(g, C))
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* GATHER */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C))
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* GATHER */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
+#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v)
+
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c
index e07d6946258afd9daca0ec526752c15352620c5c..e66cb2c47cc5bd1f62d774952a7e2397005f1e47 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.c
@@ -15,10 +15,10 @@
 #include <sys/stat.h>
 
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
-#include "ocl.h"
 #include "main.h"
-#include "lbm.h"
+#include "ocl.h"
 
 /*############################################################################*/
 
@@ -27,259 +27,266 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid;
 /*############################################################################*/
 
 struct pb_TimerSet timers;
-int main( int nArgs, char* arg[] ) {
-    MAIN_Param param;
-    int t;
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
+  int t;
 
-    OpenCL_Param prm;
+  OpenCL_Param prm;
 
-    struct pb_Parameters* params;
-    params = pb_ReadParameters(&nArgs, arg);
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
 
+  // Setup TEMP datastructures
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
 
-    //Setup TEMP datastructures
-    MAIN_parseCommandLine( nArgs, arg, &param, params );
-    MAIN_printInfo( &param );
+  /*MAIN_initialize( &param, &prm ); */ // This has been inlined
 
-    /*MAIN_initialize( &param, &prm ); */ // This has been inlined
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
 
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  if (param.obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param.obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param.obstacleFilename);
+  }
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    if( param.obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param.obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param.obstacleFilename );
-    }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
+
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
-
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
-
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
- 
-    OpenCL_initialize(&prm);
-
-    //Setup DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( &prm, &OpenCL_dstGrid );
-
-    //Initialize DEVICE datastructures
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( &prm, OpenCL_dstGrid, TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    for( int i=0; i < 4; i++) {
-      for( t = 1; t <= param.nTimeSteps; t++ ) {
-          /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-          OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid );
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-          LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid );
-
-          if( (t & 63) == 0 ) {
-              printf( "timestep: %i\n", t );
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  OpenCL_initialize(&prm);
+
+  // Setup DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(&prm, &OpenCL_dstGrid);
+
+  // Initialize DEVICE datastructures
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(&prm, OpenCL_dstGrid, TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  for (int i = 0; i < 4; i++) {
+    for (t = 1; t <= param.nTimeSteps; t++) {
+      /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+      OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+      LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid);
+
+      if ((t & 63) == 0) {
+        printf("timestep: %i\n", t);
 #if 0
               CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
               LBM_showGridStatistics( *TEMP_srcGrid );
 #endif
-          }
       }
     }
-    /*MAIN_finalize( &param, &prm );*/ // inlined
+  }
+  /*MAIN_finalize( &param, &prm );*/ // inlined
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(&prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm.clProgram);
-    clReleaseKernel(prm.clKernel);
-    clReleaseCommandQueue(prm.clCommandQueue);
-    clReleaseContext(prm.clContext);
+  clReleaseProgram(prm.clProgram);
+  clReleaseKernel(prm.clKernel);
+  clReleaseCommandQueue(prm.clCommandQueue);
+  clReleaseContext(prm.clContext);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    LBM_showGridStatistics( TEMP_srcGrid );
-    LBM_storeVelocityField( TEMP_srcGrid, param.resultFilename, TRUE );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  LBM_showGridStatistics(TEMP_srcGrid);
+  LBM_storeVelocityField(TEMP_srcGrid, param.resultFilename, TRUE);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 
-    pb_FreeParameters(params);
-    return 0;
+  pb_FreeParameters(params);
+  return 0;
 }
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) {
-    struct stat fileStat;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
 
-    if( nArgs < 2 ) {
-        printf( "syntax: lbm <time steps>\n" );
-        exit( 1 );
-    }
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
 
-    param->nTimeSteps     = atoi( arg[1] );
-
-    if( params->inpFiles[0] != NULL ) {
-        param->obstacleFilename = params->inpFiles[0];
-
-        if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-            printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-                    param->obstacleFilename );
-            exit( 1 );
-        }
-        if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-            printf( "MAIN_parseCommandLine:\n"
-                    "\tsize of file '%s' is %i bytes\n"
-                    "\texpected size is %i bytes\n",
-                    param->obstacleFilename, (int) fileStat.st_size,
-                    SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-            exit( 1 );
-        }
+  param->nTimeSteps = atoi(arg[1]);
+
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
+    }
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
     }
-    else param->obstacleFilename = NULL;
+  } else
+    param->obstacleFilename = NULL;
 
-    param->resultFilename = params->outFile;
+  param->resultFilename = params->outFile;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-    printf( "MAIN_printInfo:\n"
-            "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-            "\tnTimeSteps     : %i\n"
-            "\tresult file    : %s\n"
-            "\taction         : %s\n"
-            "\tsimulation type: %s\n"
-            "\tobstacle file  : %s\n\n",
-            SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-            param->nTimeSteps, param->resultFilename,
-            "store", "lid-driven cavity",
-            (param->obstacleFilename == NULL) ? "<none>" :
-            param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
-    LBM_allocateGrid( (float**) &TEMP_dstGrid );
-    LBM_initializeGrid( TEMP_srcGrid );
-    LBM_initializeGrid( TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    if( param->obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename );
-        LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename );
-    }
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    //Setup DEVICE datastructures
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid );
-    OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid );
-
-    //Initialize DEVICE datastructures
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid );
-    OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid );
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
-
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
-    LBM_freeGrid( (float**) &TEMP_dstGrid );
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
+  LBM_allocateGrid((float **)&TEMP_dstGrid);
+  LBM_initializeGrid(TEMP_srcGrid);
+  LBM_initializeGrid(TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
+  LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  // Setup DEVICE datastructures
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
+  OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
+
+  // Initialize DEVICE datastructures
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
+
+  LBM_freeGrid((float **)&TEMP_srcGrid);
+  LBM_freeGrid((float **)&TEMP_dstGrid);
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_Grid TEMP_srcGrid;
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_Grid TEMP_srcGrid;
 
-    //Setup TEMP datastructures
-    LBM_allocateGrid( (float**) &TEMP_srcGrid );
+  // Setup TEMP datastructures
+  LBM_allocateGrid((float **)&TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_showGridStatistics( TEMP_srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_showGridStatistics(TEMP_srcGrid);
 
-    LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE );
+  LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
 
-    LBM_freeGrid( (float**) &TEMP_srcGrid );
+  LBM_freeGrid((float **)&TEMP_srcGrid);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  OpenCL_LBM_freeGrid(OpenCL_srcGrid);
+  OpenCL_LBM_freeGrid(OpenCL_dstGrid);
 
-    clReleaseProgram(prm->clProgram);
-    clReleaseKernel(prm->clKernel);
-    clReleaseCommandQueue(prm->clCommandQueue);
-    clReleaseContext(prm->clContext);
+  clReleaseProgram(prm->clProgram);
+  clReleaseKernel(prm->clKernel);
+  clReleaseCommandQueue(prm->clCommandQueue);
+  clReleaseContext(prm->clContext);
 }
 
-void OpenCL_initialize(OpenCL_Param* prm)
-{
-    cl_int clStatus;
+void OpenCL_initialize(OpenCL_Param *prm) {
+  cl_int clStatus;
 
-    clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL);
-    CHECK_ERROR("clGetPlatformIDs")
+  clStatus = clGetPlatformIDs(1, &(prm->clPlatform), NULL);
+  CHECK_ERROR("clGetPlatformIDs")
 
-    prm->clCps[0] = CL_CONTEXT_PLATFORM;
-    prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
-    prm->clCps[2] = 0;
+  prm->clCps[0] = CL_CONTEXT_PLATFORM;
+  prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
+  prm->clCps[2] = 0;
 
-    clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL);
-    CHECK_ERROR("clGetDeviceIDs")
+  clStatus = clGetDeviceIDs(prm->clPlatform, CL_DEVICE_TYPE_GPU, 1,
+                            &(prm->clDevice), NULL);
+  CHECK_ERROR("clGetDeviceIDs")
 
-    prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
+  prm->clContext = clCreateContextFromType(prm->clCps, CL_DEVICE_TYPE_GPU, NULL,
+                                           NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
 
-    prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
+  prm->clCommandQueue = clCreateCommandQueue(
+      prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
 
-    pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
+  pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
 
-    //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-    //prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);
-    //CHECK_ERROR("clCreateProgramWithSource")
+  // const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+  // prm->clProgram =
+  // clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
 
-    //char clOptions[100];
-    //sprintf(clOptions,"-I src/opencl_nvidia");
+  // char clOptions[100];
+  // sprintf(clOptions,"-I src/opencl_nvidia");
 
-    //clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);
-    //CHECK_ERROR("clBuildProgram")
+  // clStatus =
+  // clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
 
-    //prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);
-    //CHECK_ERROR("clCreateKernel")
+  // prm->clKernel =
+  // clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
 
-    //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
-    pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_short_default/kernel_offline.nvptx.s", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel);
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_nvidia_short_default/kernel_offline.nvptx.s",
+      "performStreamCollide_kernel", &prm->clContext, &prm->clDevice,
+      &prm->clProgram, &prm->clKernel);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.h
index 2ca41792bbd8ed8d7596d52e1ef79038935617ca..5f58edc2616cece34c4b3d0467f991d9c4bd93c9 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/main.h
@@ -12,19 +12,20 @@
 /*############################################################################*/
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm );
-void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm);
+void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm);
 
-void OpenCL_initialize(OpenCL_Param* prm);
+void OpenCL_initialize(OpenCL_Param *prm);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.c b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.c
index 78a792924aa1e0ddf0130daba1270da1d36ec116..4f232db0d9776f4f2d0eb4b2444036f35ff27257 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.c
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.c
@@ -1,40 +1,36 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include "ocl.h"
 
-char* readFile(char* fileName)
-{
-	FILE* fp;
-	fp = fopen(fileName,"r");
+char *readFile(char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
 
-	if(fp == NULL)
-	{
-		printf("Error 1!\n");
-		return NULL;
-	}
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    return NULL;
+  }
 
-	fseek(fp,0,SEEK_END);
-	long size = ftell(fp);
-	rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-	char* buffer = malloc(sizeof(char)*(size+1));
-	if(buffer == NULL)
-	{
-		printf("Error 2!\n");
-		fclose(fp);
-		return NULL;
-	}
+  char *buffer = malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	size_t res = fread(buffer,1,size,fp);
-	if(res != size)
-	{
-		printf("Error 3!\n");
-		fclose(fp);
-		return NULL;
-	}
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    return NULL;
+  }
 
-	buffer[size] = 0;
-	fclose(fp);
-	return buffer;
-}	
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
+}
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.h b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.h
index 5a08a6bab9a95fa8c0158741363dd2a5c92a45b7..5d5d984ba698d6ac71af3e51de3e6724a79135aa 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/opencl_nvidia_short/ocl.h
@@ -2,24 +2,22 @@
 #define __OCLH__
 
 typedef struct {
-	cl_platform_id clPlatform;
-	cl_context_properties clCps[3];
-	cl_device_id clDevice;
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-	cl_program clProgram;
-	cl_kernel clKernel;
+  cl_platform_id clPlatform;
+  cl_context_properties clCps[3];
+  cl_device_id clDevice;
+  cl_context clContext;
+  cl_command_queue clCommandQueue;
+  cl_program clProgram;
+  cl_kernel clKernel;
 } OpenCL_Param;
 
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-char* readFile(char*);
+char *readFile(char *);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h
index 467c8998b31560b3efe7f94367345db3fb2c958a..d44088661d313eeca6d44612549337b5a2630e04 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/layout_config.h
@@ -13,31 +13,31 @@
 
 /*############################################################################*/
 
-//Unchangeable settings: volume simulation size for the given example
+// Unchangeable settings: volume simulation size for the given example
 #define SIZE_X (120)
 #define SIZE_Y (120)
 #define SIZE_Z (150)
 
-//Changeable settings
-//Padding in each dimension
+// Changeable settings
+// Padding in each dimension
 #define PADDING_X (8)
 #define PADDING_Y (0)
 #define PADDING_Z (4)
 
-//Pitch in each dimension
-#define PADDED_X (SIZE_X+PADDING_X)
-#define PADDED_Y (SIZE_Y+PADDING_Y)
-#define PADDED_Z (SIZE_Z+PADDING_Z)
+// Pitch in each dimension
+#define PADDED_X (SIZE_X + PADDING_X)
+#define PADDED_Y (SIZE_Y + PADDING_Y)
+#define PADDED_Z (SIZE_Z + PADDING_Z)
 
-#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
-#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)
+#define TOTAL_CELLS (SIZE_X * SIZE_Y * SIZE_Z)
+#define TOTAL_PADDED_CELLS (PADDED_X * PADDED_Y * PADDED_Z)
 
 //  Flattening function
 //  This macro will be used to map a 3-D index and element to a value
-#define CALC_INDEX(x,y,z,e) ( TOTAL_PADDED_CELLS*e + \
-                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
+#define CALC_INDEX(x, y, z, e)                                                 \
+  (TOTAL_PADDED_CELLS * e + ((x) + (y)*PADDED_X + (z)*PADDED_X * PADDED_Y))
 
-#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
+#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0, 0, 0, 0))
 
 // Set this value to 1 for GATHER, or 0 for SCATTER
 #if 1
@@ -46,22 +46,41 @@
 #define SCATTER
 #endif
 
-//OpenCL block size (not trivially changeable here)
+// OpenCL block size (not trivially changeable here)
 #define BLOCK_SIZE SIZE_X
 
 /*############################################################################*/
 
-typedef enum {C = 0,
-              N, S, E, W, T, B,
-              NE, NW, SE, SW,
-              NT, NB, ST, SB,
-              ET, EB, WT, WB,
-              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+typedef enum {
+  C = 0,
+  N,
+  S,
+  E,
+  W,
+  T,
+  B,
+  NE,
+  NW,
+  SE,
+  SW,
+  NT,
+  NB,
+  ST,
+  SB,
+  ET,
+  EB,
+  WT,
+  WB,
+  FLAGS,
+  N_CELL_ENTRIES
+} CELL_ENTRIES;
 
 #define N_DISTR_FUNCS FLAGS
 
-typedef enum {OBSTACLE    = 1 << 0,
-              ACCEL       = 1 << 1,
-              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;
+typedef enum {
+  OBSTACLE = 1 << 0,
+  ACCEL = 1 << 1,
+  IN_OUT_FLOW = 1 << 2
+} CELL_FLAGS;
 
 #endif /* _CONFIG_H_ */
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp
index 9b9de7702142f56cfc492aa3d680990a8f707a56..cf00ad76a285f5209ecee541308d5f18ed356249 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp
+++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.cpp
@@ -9,295 +9,263 @@
 /*############################################################################*/
 
 // includes, system
+#include <float.h>
 #include <math.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
-#include <float.h>
 
 // includes, project
 #include "layout_config.h"
-#include "lbm_macros.h"
 #include "lbm.h"
+#include "lbm_macros.h"
 
 /******************************************************************************/
 
-
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr ) {
-	const size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
+void LBM_allocateGrid(float **ptr) {
+  const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
 
-	*ptr = (float*)malloc( size );
-	if( ! *ptr ) {
-		printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
-				size / (1024.0*1024.0) );
-		exit( 1 );
-	}
+  *ptr = (float *)malloc(size);
+  if (!*ptr) {
+    printf("LBM_allocateGrid: could not allocate %.1f MByte\n",
+           size / (1024.0 * 1024.0));
+    exit(1);
+  }
 
-	memset( *ptr, 0, size );
+  memset(*ptr, 0, size);
 
-	printf( "LBM_allocateGrid: allocated %.1f MByte\n",
-			size / (1024.0*1024.0) );
-	
-	*ptr += MARGIN;
+  printf("LBM_allocateGrid: allocated %.1f MByte\n", size / (1024.0 * 1024.0));
+
+  *ptr += MARGIN;
 }
 
 /******************************************************************************/
 
 /*############################################################################*/
 
-void LBM_freeGrid( float** ptr ) {
-	free( *ptr-MARGIN );
-	*ptr = NULL;
+void LBM_freeGrid(float **ptr) {
+  free(*ptr - MARGIN);
+  *ptr = NULL;
 }
 
 /******************************************************************************/
 
 /*############################################################################*/
 
-void LBM_initializeGrid( LBM_Grid grid ) {
-	SWEEP_VAR
-
-	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-	SRC_C( grid  ) = DFL1;
-	SRC_N( grid  ) = DFL2;
-	SRC_S( grid  ) = DFL2;
-	SRC_E( grid  ) = DFL2;
-	SRC_W( grid  ) = DFL2;
-	SRC_T( grid  ) = DFL2;
-	SRC_B( grid  ) = DFL2;
-	SRC_NE( grid ) = DFL3;
-	SRC_NW( grid ) = DFL3;
-	SRC_SE( grid ) = DFL3;
-	SRC_SW( grid ) = DFL3;
-	SRC_NT( grid ) = DFL3;
-	SRC_NB( grid ) = DFL3;
-	SRC_ST( grid ) = DFL3;
-	SRC_SB( grid ) = DFL3;
-	SRC_ET( grid ) = DFL3;
-	SRC_EB( grid ) = DFL3;
-	SRC_WT( grid ) = DFL3;
-	SRC_WB( grid ) = DFL3;
-	
-	CLEAR_ALL_FLAGS_SWEEP( grid );
-	SWEEP_END
+void LBM_initializeGrid(LBM_Grid grid) {
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  SRC_C(grid) = DFL1;
+  SRC_N(grid) = DFL2;
+  SRC_S(grid) = DFL2;
+  SRC_E(grid) = DFL2;
+  SRC_W(grid) = DFL2;
+  SRC_T(grid) = DFL2;
+  SRC_B(grid) = DFL2;
+  SRC_NE(grid) = DFL3;
+  SRC_NW(grid) = DFL3;
+  SRC_SE(grid) = DFL3;
+  SRC_SW(grid) = DFL3;
+  SRC_NT(grid) = DFL3;
+  SRC_NB(grid) = DFL3;
+  SRC_ST(grid) = DFL3;
+  SRC_SB(grid) = DFL3;
+  SRC_ET(grid) = DFL3;
+  SRC_EB(grid) = DFL3;
+  SRC_WT(grid) = DFL3;
+  SRC_WB(grid) = DFL3;
+
+  CLEAR_ALL_FLAGS_SWEEP(grid);
+  SWEEP_END
 }
 
 /******************************************************************************/
 
 /*############################################################################*/
 
-void LBM_swapGrids( LBM_Grid* grid1, LBM_Grid* grid2 ) {
-	LBM_Grid aux = *grid1;
-	*grid1 = *grid2;
-	*grid2 = aux;
+void LBM_swapGrids(LBM_Grid *grid1, LBM_Grid *grid2) {
+  LBM_Grid aux = *grid1;
+  *grid1 = *grid2;
+  *grid2 = aux;
 }
 
 /*############################################################################*/
 
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
-	int x,  y,  z;
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename) {
+  int x, y, z;
 
-	FILE* file = fopen( filename, "rb" );
+  FILE *file = fopen(filename, "rb");
 
-	for( z = 0; z < SIZE_Z; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE );
-			}
-			fgetc( file );
-		}
-		fgetc( file );
-	}
+  for (z = 0; z < SIZE_Z; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (fgetc(file) != '.')
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+      }
+      fgetc(file);
+    }
+    fgetc(file);
+  }
 
-	fclose( file );
+  fclose(file);
 }
 
 /*############################################################################*/
 
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
-	int x,  y,  z;
-
-	for( z = -2; z < SIZE_Z+2; z++ ) {
-		for( y = 0; y < SIZE_Y; y++ ) {
-			for( x = 0; x < SIZE_X; x++ ) {
-				if( x == 0 || x == SIZE_X-1 ||
-						y == 0 || y == SIZE_Y-1 ||
-						z == 0 || z == SIZE_Z-1 ) {
-					SET_FLAG( grid, x, y, z, OBSTACLE );
-				}
-				else {
-					if( (z == 1 || z == SIZE_Z-2) &&
-							x > 1 && x < SIZE_X-2 &&
-							y > 1 && y < SIZE_Y-2 ) {
-						SET_FLAG( grid, x, y, z, ACCEL );
-					}
-				}
-			}
-		}
-	}
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid) {
+  int x, y, z;
+
+  for (z = -2; z < SIZE_Z + 2; z++) {
+    for (y = 0; y < SIZE_Y; y++) {
+      for (x = 0; x < SIZE_X; x++) {
+        if (x == 0 || x == SIZE_X - 1 || y == 0 || y == SIZE_Y - 1 || z == 0 ||
+            z == SIZE_Z - 1) {
+          SET_FLAG(grid, x, y, z, OBSTACLE);
+        } else {
+          if ((z == 1 || z == SIZE_Z - 2) && x > 1 && x < SIZE_X - 2 && y > 1 &&
+              y < SIZE_Y - 2) {
+            SET_FLAG(grid, x, y, z, ACCEL);
+          }
+        }
+      }
+    }
+  }
 }
 
 /*############################################################################*/
 
-void LBM_showGridStatistics( LBM_Grid grid ) {
-	int nObstacleCells = 0,
-	    nAccelCells    = 0,
-	    nFluidCells    = 0;
-	float ux, uy, uz;
-	float minU2  = 1e+30, maxU2  = -1e+30, u2;
-	float minRho = 1e+30, maxRho = -1e+30, rho;
-	float mass = 0;
-
-	SWEEP_VAR
-
-		SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
-		rho = LOCAL( grid, C  ) + LOCAL( grid, N  )
-		+ LOCAL( grid, S  ) + LOCAL( grid, E  )
-		+ LOCAL( grid, W  ) + LOCAL( grid, T  )
-		+ LOCAL( grid, B  ) + LOCAL( grid, NE )
-		+ LOCAL( grid, NW ) + LOCAL( grid, SE )
-		+ LOCAL( grid, SW ) + LOCAL( grid, NT )
-		+ LOCAL( grid, NB ) + LOCAL( grid, ST )
-		+ LOCAL( grid, SB ) + LOCAL( grid, ET )
-		+ LOCAL( grid, EB ) + LOCAL( grid, WT )
-		+ LOCAL( grid, WB );
-
-	if( rho < minRho ) minRho = rho;
-	if( rho > maxRho ) maxRho = rho;
-	mass += rho;
-
-	if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
-		nObstacleCells++;
-	}
-	else {
-		if( TEST_FLAG_SWEEP( grid, ACCEL ))
-			nAccelCells++;
-		else
-			nFluidCells++;
-
-		ux = + LOCAL( grid, E  ) - LOCAL( grid, W  )
-			+ LOCAL( grid, NE ) - LOCAL( grid, NW )
-			+ LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, ET ) + LOCAL( grid, EB )
-			- LOCAL( grid, WT ) - LOCAL( grid, WB );
-		uy = + LOCAL( grid, N  ) - LOCAL( grid, S  )
-			+ LOCAL( grid, NE ) + LOCAL( grid, NW )
-			- LOCAL( grid, SE ) - LOCAL( grid, SW )
-			+ LOCAL( grid, NT ) + LOCAL( grid, NB )
-			- LOCAL( grid, ST ) - LOCAL( grid, SB );
-		uz = + LOCAL( grid, T  ) - LOCAL( grid, B  )
-			+ LOCAL( grid, NT ) - LOCAL( grid, NB )
-			+ LOCAL( grid, ST ) - LOCAL( grid, SB )
-			+ LOCAL( grid, ET ) - LOCAL( grid, EB )
-			+ LOCAL( grid, WT ) - LOCAL( grid, WB );
-		u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);
-		if( u2 < minU2 ) minU2 = u2;
-		if( u2 > maxU2 ) maxU2 = u2;
-	}
-	SWEEP_END
-
-		printf( "LBM_showGridStatistics:\n"
-				"\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
-				"\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
-				"\tminU: %e maxU: %e\n\n",
-				nObstacleCells, nAccelCells, nFluidCells,
-				minRho, maxRho, mass,
-				sqrt( minU2 ), sqrt( maxU2 ) );
-
+void LBM_showGridStatistics(LBM_Grid grid) {
+  int nObstacleCells = 0, nAccelCells = 0, nFluidCells = 0;
+  float ux, uy, uz;
+  float minU2 = 1e+30, maxU2 = -1e+30, u2;
+  float minRho = 1e+30, maxRho = -1e+30, rho;
+  float mass = 0;
+
+  SWEEP_VAR
+
+  SWEEP_START(0, 0, 0, 0, 0, SIZE_Z)
+  rho = LOCAL(grid, C) + LOCAL(grid, N) + LOCAL(grid, S) + LOCAL(grid, E) +
+        LOCAL(grid, W) + LOCAL(grid, T) + LOCAL(grid, B) + LOCAL(grid, NE) +
+        LOCAL(grid, NW) + LOCAL(grid, SE) + LOCAL(grid, SW) + LOCAL(grid, NT) +
+        LOCAL(grid, NB) + LOCAL(grid, ST) + LOCAL(grid, SB) + LOCAL(grid, ET) +
+        LOCAL(grid, EB) + LOCAL(grid, WT) + LOCAL(grid, WB);
+
+  if (rho < minRho)
+    minRho = rho;
+  if (rho > maxRho)
+    maxRho = rho;
+  mass += rho;
+
+  if (TEST_FLAG_SWEEP(grid, OBSTACLE)) {
+    nObstacleCells++;
+  } else {
+    if (TEST_FLAG_SWEEP(grid, ACCEL))
+      nAccelCells++;
+    else
+      nFluidCells++;
+
+    ux = +LOCAL(grid, E) - LOCAL(grid, W) + LOCAL(grid, NE) - LOCAL(grid, NW) +
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, ET) + LOCAL(grid, EB) -
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    uy = +LOCAL(grid, N) - LOCAL(grid, S) + LOCAL(grid, NE) + LOCAL(grid, NW) -
+         LOCAL(grid, SE) - LOCAL(grid, SW) + LOCAL(grid, NT) + LOCAL(grid, NB) -
+         LOCAL(grid, ST) - LOCAL(grid, SB);
+    uz = +LOCAL(grid, T) - LOCAL(grid, B) + LOCAL(grid, NT) - LOCAL(grid, NB) +
+         LOCAL(grid, ST) - LOCAL(grid, SB) + LOCAL(grid, ET) - LOCAL(grid, EB) +
+         LOCAL(grid, WT) - LOCAL(grid, WB);
+    u2 = (ux * ux + uy * uy + uz * uz) / (rho * rho);
+    if (u2 < minU2)
+      minU2 = u2;
+    if (u2 > maxU2)
+      maxU2 = u2;
+  }
+  SWEEP_END
+
+  printf("LBM_showGridStatistics:\n"
+         "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
+         "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
+         "\tminU: %e maxU: %e\n\n",
+         nObstacleCells, nAccelCells, nFluidCells, minRho, maxRho, mass,
+         sqrt(minU2), sqrt(maxU2));
 }
 
 /*############################################################################*/
 
-static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		const char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			buffer[i] = vPtr[sizeof( OUTPUT_PRECISION ) - i - 1];
-
-		fwrite( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
-	else {                                                     /* little endian */
-		fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void storeValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    const char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    int i;
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      buffer[i] = vPtr[sizeof(OUTPUT_PRECISION) - i - 1];
+
+    fwrite(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+  } else { /* little endian */
+    fwrite(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
-	const int litteBigEndianTest = 1;
-	if( (*((unsigned char*) &litteBigEndianTest)) == 0 ) {         /* big endian */
-		char* vPtr = (char*) v;
-		char buffer[sizeof( OUTPUT_PRECISION )];
-		int i;
-
-		fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file );
-
-		for (i = 0; i < sizeof( OUTPUT_PRECISION ); i++)
-			vPtr[i] = buffer[sizeof( OUTPUT_PRECISION ) - i - 1];
-	}
-	else {                                                     /* little endian */
-		fread( v, sizeof( OUTPUT_PRECISION ), 1, file );
-	}
+static void loadValue(FILE *file, OUTPUT_PRECISION *v) {
+  const int litteBigEndianTest = 1;
+  if ((*((unsigned char *)&litteBigEndianTest)) == 0) { /* big endian */
+    char *vPtr = (char *)v;
+    char buffer[sizeof(OUTPUT_PRECISION)];
+    int i;
+
+    fread(buffer, sizeof(OUTPUT_PRECISION), 1, file);
+
+    for (i = 0; i < sizeof(OUTPUT_PRECISION); i++)
+      vPtr[i] = buffer[sizeof(OUTPUT_PRECISION) - i - 1];
+  } else { /* little endian */
+    fread(v, sizeof(OUTPUT_PRECISION), 1, file);
+  }
 }
 
 /*############################################################################*/
 
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-		const int binary ) {
-	OUTPUT_PRECISION rho, ux, uy, uz;
-
-	FILE* file = fopen( filename, (binary ? "wb" : "w") );
-
-	SWEEP_VAR
-	SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z)
-				rho = + SRC_C( grid ) + SRC_N( grid )
-					+ SRC_S( grid ) + SRC_E( grid )
-					+ SRC_W( grid ) + SRC_T( grid )
-					+ SRC_B( grid ) + SRC_NE( grid )
-					+ SRC_NW( grid ) + SRC_SE( grid )
-					+ SRC_SW( grid ) + SRC_NT( grid )
-					+ SRC_NB( grid ) + SRC_ST( grid )
-					+ SRC_SB( grid ) + SRC_ET( grid )
-					+ SRC_EB( grid ) + SRC_WT( grid )
-					+ SRC_WB( grid );
-				ux = + SRC_E( grid ) - SRC_W( grid ) 
-					+ SRC_NE( grid ) - SRC_NW( grid ) 
-					+ SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_ET( grid ) + SRC_EB( grid ) 
-					- SRC_WT( grid ) - SRC_WB( grid );
-				uy = + SRC_N( grid ) - SRC_S( grid ) 
-					+ SRC_NE( grid ) + SRC_NW( grid ) 
-					- SRC_SE( grid ) - SRC_SW( grid ) 
-					+ SRC_NT( grid ) + SRC_NB( grid ) 
-					- SRC_ST( grid ) - SRC_SB( grid );
-				uz = + SRC_T( grid ) - SRC_B( grid ) 
-					+ SRC_NT( grid ) - SRC_NB( grid ) 
-					+ SRC_ST( grid ) - SRC_SB( grid ) 
-					+ SRC_ET( grid ) - SRC_EB( grid ) 
-					+ SRC_WT( grid ) - SRC_WB( grid );
-				ux /= rho;
-				uy /= rho;
-				uz /= rho;
-
-				if( binary ) {
-					/*
-					   fwrite( &ux, sizeof( ux ), 1, file );
-					   fwrite( &uy, sizeof( uy ), 1, file );
-					   fwrite( &uz, sizeof( uz ), 1, file );
-					   */
-					storeValue( file, &ux );
-					storeValue( file, &uy );
-					storeValue( file, &uz );
-				} else
-					fprintf( file, "%e %e %e\n", ux, uy, uz );
-
-	SWEEP_END;
-
-	fclose( file );
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const int binary) {
+  OUTPUT_PRECISION rho, ux, uy, uz;
+
+  FILE *file = fopen(filename, (binary ? "wb" : "w"));
+
+  SWEEP_VAR
+  SWEEP_START(0, 0, 0, SIZE_X, SIZE_Y, SIZE_Z)
+  rho = +SRC_C(grid) + SRC_N(grid) + SRC_S(grid) + SRC_E(grid) + SRC_W(grid) +
+        SRC_T(grid) + SRC_B(grid) + SRC_NE(grid) + SRC_NW(grid) + SRC_SE(grid) +
+        SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) + SRC_ST(grid) +
+        SRC_SB(grid) + SRC_ET(grid) + SRC_EB(grid) + SRC_WT(grid) +
+        SRC_WB(grid);
+  ux = +SRC_E(grid) - SRC_W(grid) + SRC_NE(grid) - SRC_NW(grid) + SRC_SE(grid) -
+       SRC_SW(grid) + SRC_ET(grid) + SRC_EB(grid) - SRC_WT(grid) - SRC_WB(grid);
+  uy = +SRC_N(grid) - SRC_S(grid) + SRC_NE(grid) + SRC_NW(grid) - SRC_SE(grid) -
+       SRC_SW(grid) + SRC_NT(grid) + SRC_NB(grid) - SRC_ST(grid) - SRC_SB(grid);
+  uz = +SRC_T(grid) - SRC_B(grid) + SRC_NT(grid) - SRC_NB(grid) + SRC_ST(grid) -
+       SRC_SB(grid) + SRC_ET(grid) - SRC_EB(grid) + SRC_WT(grid) - SRC_WB(grid);
+  ux /= rho;
+  uy /= rho;
+  uz /= rho;
+
+  if (binary) {
+    /*
+       fwrite( &ux, sizeof( ux ), 1, file );
+       fwrite( &uy, sizeof( uy ), 1, file );
+       fwrite( &uz, sizeof( uz ), 1, file );
+       */
+    storeValue(file, &ux);
+    storeValue(file, &uy);
+    storeValue(file, &uz);
+  } else
+    fprintf(file, "%e %e %e\n", ux, uy, uz);
+
+  SWEEP_END;
+
+  fclose(file);
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h
index 4768d949a9cae7bf2d118feaa0b2200667200b2f..8f2d5fde2470862aea3efb51bb46262f86008518 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm.h
@@ -13,15 +13,15 @@
 
 /*############################################################################*/
 
-void LBM_allocateGrid( float** ptr );
-void LBM_freeGrid( float** ptr );
-void LBM_initializeGrid( LBM_Grid grid );
-void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
-void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
-void LBM_swapGrids( LBM_Grid* grid1, LBM_Grid* grid2 );
-void LBM_showGridStatistics( LBM_Grid Grid );
-void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
-                           const BOOL binary );
+void LBM_allocateGrid(float **ptr);
+void LBM_freeGrid(float **ptr);
+void LBM_initializeGrid(LBM_Grid grid);
+void LBM_initializeSpecialCellsForLDC(LBM_Grid grid);
+void LBM_loadObstacleFile(LBM_Grid grid, const char *filename);
+void LBM_swapGrids(LBM_Grid *grid1, LBM_Grid *grid2);
+void LBM_showGridStatistics(LBM_Grid Grid);
+void LBM_storeVelocityField(LBM_Grid grid, const char *filename,
+                            const BOOL binary);
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h
index d8ceb373dfe0e7a02d374c78a9c3fd68cc8f3085..ae91da7c5c304aac7537de0c68edffcd91a83dbe 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/lbm_macros.h
@@ -17,160 +17,181 @@
 #define TRUE (-1)
 #define FALSE (0)
 
-#define DFL1 (1.0f/ 3.0f)
-#define DFL2 (1.0f/18.0f)
-#define DFL3 (1.0f/36.0f)
+#define DFL1 (1.0f / 3.0f)
+#define DFL2 (1.0f / 18.0f)
+#define DFL3 (1.0f / 36.0f)
 
 /*############################################################################*/
 
-typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
-typedef LBM_Grid* LBM_GridPtr;
+typedef float
+    *LBM_Grid; // float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
+typedef LBM_Grid *LBM_GridPtr;
 
 /*############################################################################*/
 
-
-#define SWEEP_X  __temp_x__
-#define SWEEP_Y  __temp_y__
-#define SWEEP_Z  __temp_z__
+#define SWEEP_X __temp_x__
+#define SWEEP_Y __temp_y__
+#define SWEEP_Z __temp_z__
 #define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
 
-#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
-	for( __temp_z__ = z1; \
-	     __temp_z__ < z2; \
-		__temp_z__++) { \
-            for( __temp_y__ = 0; \
-                 __temp_y__ < SIZE_Y; \
-                 __temp_y__++) { \
-		for(__temp_x__ = 0; \
-	            __temp_x__ < SIZE_X; \
-                    __temp_x__++) { \
-
-#define SWEEP_END }}}
-
-
-#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
-#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
-
-#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
-#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
-#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
-#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
-#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
-#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
-#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
-#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
-#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
-#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
-#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
-#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
-#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
-#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
-#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
-#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
-#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
-#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
-#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))
-
+#define SWEEP_START(x1, y1, z1, x2, y2, z2)                                    \
+  for (__temp_z__ = z1; __temp_z__ < z2; __temp_z__++) {                       \
+    for (__temp_y__ = 0; __temp_y__ < SIZE_Y; __temp_y__++) {                  \
+      for (__temp_x__ = 0; __temp_x__ < SIZE_X; __temp_x__++) {
+
+#define SWEEP_END                                                              \
+  }                                                                            \
+  }                                                                            \
+  }
+
+#define GRID_ENTRY(g, x, y, z, e) ((g)[CALC_INDEX(x, y, z, e)])
+#define GRID_ENTRY_SWEEP(g, dx, dy, dz, e)                                     \
+  ((g)[CALC_INDEX((dx) + SWEEP_X, (dy) + SWEEP_Y, (dz) + SWEEP_Z, e)])
+
+#define LOCAL(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_C(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, 0, e))
+#define NEIGHBOR_N(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, 0, e))
+#define NEIGHBOR_S(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, 0, e))
+#define NEIGHBOR_E(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, 0, e))
+#define NEIGHBOR_W(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, 0, e))
+#define NEIGHBOR_T(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, +1, e))
+#define NEIGHBOR_B(g, e) (GRID_ENTRY_SWEEP(g, 0, 0, -1, e))
+#define NEIGHBOR_NE(g, e) (GRID_ENTRY_SWEEP(g, +1, +1, 0, e))
+#define NEIGHBOR_NW(g, e) (GRID_ENTRY_SWEEP(g, -1, +1, 0, e))
+#define NEIGHBOR_SE(g, e) (GRID_ENTRY_SWEEP(g, +1, -1, 0, e))
+#define NEIGHBOR_SW(g, e) (GRID_ENTRY_SWEEP(g, -1, -1, 0, e))
+#define NEIGHBOR_NT(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, +1, e))
+#define NEIGHBOR_NB(g, e) (GRID_ENTRY_SWEEP(g, 0, +1, -1, e))
+#define NEIGHBOR_ST(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, +1, e))
+#define NEIGHBOR_SB(g, e) (GRID_ENTRY_SWEEP(g, 0, -1, -1, e))
+#define NEIGHBOR_ET(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, +1, e))
+#define NEIGHBOR_EB(g, e) (GRID_ENTRY_SWEEP(g, +1, 0, -1, e))
+#define NEIGHBOR_WT(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, +1, e))
+#define NEIGHBOR_WB(g, e) (GRID_ENTRY_SWEEP(g, -1, 0, -1, e))
 
 #ifdef SCATTER
 
-#define SRC_C(g)  (LOCAL( g, C  ))
-#define SRC_N(g)  (LOCAL( g, N  ))
-#define SRC_S(g)  (LOCAL( g, S  ))
-#define SRC_E(g)  (LOCAL( g, E  ))
-#define SRC_W(g)  (LOCAL( g, W  ))
-#define SRC_T(g)  (LOCAL( g, T  ))
-#define SRC_B(g)  (LOCAL( g, B  ))
-#define SRC_NE(g) (LOCAL( g, NE ))
-#define SRC_NW(g) (LOCAL( g, NW ))
-#define SRC_SE(g) (LOCAL( g, SE ))
-#define SRC_SW(g) (LOCAL( g, SW ))
-#define SRC_NT(g) (LOCAL( g, NT ))
-#define SRC_NB(g) (LOCAL( g, NB ))
-#define SRC_ST(g) (LOCAL( g, ST ))
-#define SRC_SB(g) (LOCAL( g, SB ))
-#define SRC_ET(g) (LOCAL( g, ET ))
-#define SRC_EB(g) (LOCAL( g, EB ))
-#define SRC_WT(g) (LOCAL( g, WT ))
-#define SRC_WB(g) (LOCAL( g, WB ))
-
-#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
-#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
-#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
-#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
-#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
-#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
-#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
-#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
-#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
-#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
-#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
-#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
-#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
-#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
-#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
-#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
-#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
-#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
-#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
+#define SRC_C(g) (LOCAL(g, C))
+#define SRC_N(g) (LOCAL(g, N))
+#define SRC_S(g) (LOCAL(g, S))
+#define SRC_E(g) (LOCAL(g, E))
+#define SRC_W(g) (LOCAL(g, W))
+#define SRC_T(g) (LOCAL(g, T))
+#define SRC_B(g) (LOCAL(g, B))
+#define SRC_NE(g) (LOCAL(g, NE))
+#define SRC_NW(g) (LOCAL(g, NW))
+#define SRC_SE(g) (LOCAL(g, SE))
+#define SRC_SW(g) (LOCAL(g, SW))
+#define SRC_NT(g) (LOCAL(g, NT))
+#define SRC_NB(g) (LOCAL(g, NB))
+#define SRC_ST(g) (LOCAL(g, ST))
+#define SRC_SB(g) (LOCAL(g, SB))
+#define SRC_ET(g) (LOCAL(g, ET))
+#define SRC_EB(g) (LOCAL(g, EB))
+#define SRC_WT(g) (LOCAL(g, WT))
+#define SRC_WB(g) (LOCAL(g, WB))
+
+#define DST_C(g) (NEIGHBOR_C(g, C))
+#define DST_N(g) (NEIGHBOR_N(g, N))
+#define DST_S(g) (NEIGHBOR_S(g, S))
+#define DST_E(g) (NEIGHBOR_E(g, E))
+#define DST_W(g) (NEIGHBOR_W(g, W))
+#define DST_T(g) (NEIGHBOR_T(g, T))
+#define DST_B(g) (NEIGHBOR_B(g, B))
+#define DST_NE(g) (NEIGHBOR_NE(g, NE))
+#define DST_NW(g) (NEIGHBOR_NW(g, NW))
+#define DST_SE(g) (NEIGHBOR_SE(g, SE))
+#define DST_SW(g) (NEIGHBOR_SW(g, SW))
+#define DST_NT(g) (NEIGHBOR_NT(g, NT))
+#define DST_NB(g) (NEIGHBOR_NB(g, NB))
+#define DST_ST(g) (NEIGHBOR_ST(g, ST))
+#define DST_SB(g) (NEIGHBOR_SB(g, SB))
+#define DST_ET(g) (NEIGHBOR_ET(g, ET))
+#define DST_EB(g) (NEIGHBOR_EB(g, EB))
+#define DST_WT(g) (NEIGHBOR_WT(g, WT))
+#define DST_WB(g) (NEIGHBOR_WB(g, WB))
 
 #else /* GATHER */
 
-#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
-#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
-#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
-#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
-#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
-#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
-#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
-#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
-#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
-#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
-#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
-#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
-#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
-#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
-#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
-#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
-#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
-#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
-#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
-
-#define DST_C(g)  (LOCAL( g, C  ))
-#define DST_N(g)  (LOCAL( g, N  ))
-#define DST_S(g)  (LOCAL( g, S  ))
-#define DST_E(g)  (LOCAL( g, E  ))
-#define DST_W(g)  (LOCAL( g, W  ))
-#define DST_T(g)  (LOCAL( g, T  ))
-#define DST_B(g)  (LOCAL( g, B  ))
-#define DST_NE(g) (LOCAL( g, NE ))
-#define DST_NW(g) (LOCAL( g, NW ))
-#define DST_SE(g) (LOCAL( g, SE ))
-#define DST_SW(g) (LOCAL( g, SW ))
-#define DST_NT(g) (LOCAL( g, NT ))
-#define DST_NB(g) (LOCAL( g, NB ))
-#define DST_ST(g) (LOCAL( g, ST ))
-#define DST_SB(g) (LOCAL( g, SB ))
-#define DST_ET(g) (LOCAL( g, ET ))
-#define DST_EB(g) (LOCAL( g, EB ))
-#define DST_WT(g) (LOCAL( g, WT ))
-#define DST_WB(g) (LOCAL( g, WB ))
+#define SRC_C(g) (NEIGHBOR_C(g, C))
+#define SRC_N(g) (NEIGHBOR_S(g, N))
+#define SRC_S(g) (NEIGHBOR_N(g, S))
+#define SRC_E(g) (NEIGHBOR_W(g, E))
+#define SRC_W(g) (NEIGHBOR_E(g, W))
+#define SRC_T(g) (NEIGHBOR_B(g, T))
+#define SRC_B(g) (NEIGHBOR_T(g, B))
+#define SRC_NE(g) (NEIGHBOR_SW(g, NE))
+#define SRC_NW(g) (NEIGHBOR_SE(g, NW))
+#define SRC_SE(g) (NEIGHBOR_NW(g, SE))
+#define SRC_SW(g) (NEIGHBOR_NE(g, SW))
+#define SRC_NT(g) (NEIGHBOR_SB(g, NT))
+#define SRC_NB(g) (NEIGHBOR_ST(g, NB))
+#define SRC_ST(g) (NEIGHBOR_NB(g, ST))
+#define SRC_SB(g) (NEIGHBOR_NT(g, SB))
+#define SRC_ET(g) (NEIGHBOR_WB(g, ET))
+#define SRC_EB(g) (NEIGHBOR_WT(g, EB))
+#define SRC_WT(g) (NEIGHBOR_EB(g, WT))
+#define SRC_WB(g) (NEIGHBOR_ET(g, WB))
+
+#define DST_C(g) (LOCAL(g, C))
+#define DST_N(g) (LOCAL(g, N))
+#define DST_S(g) (LOCAL(g, S))
+#define DST_E(g) (LOCAL(g, E))
+#define DST_W(g) (LOCAL(g, W))
+#define DST_T(g) (LOCAL(g, T))
+#define DST_B(g) (LOCAL(g, B))
+#define DST_NE(g) (LOCAL(g, NE))
+#define DST_NW(g) (LOCAL(g, NW))
+#define DST_SE(g) (LOCAL(g, SE))
+#define DST_SW(g) (LOCAL(g, SW))
+#define DST_NT(g) (LOCAL(g, NT))
+#define DST_NB(g) (LOCAL(g, NB))
+#define DST_ST(g) (LOCAL(g, ST))
+#define DST_SB(g) (LOCAL(g, SB))
+#define DST_ET(g) (LOCAL(g, ET))
+#define DST_EB(g) (LOCAL(g, EB))
+#define DST_WT(g) (LOCAL(g, WT))
+#define DST_WB(g) (LOCAL(g, WB))
 
 #endif /* GATHER */
 
-#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
-#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
-
-#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
-#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}
-
-#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
-#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
-#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
-#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}
+#define MAGIC_CAST(v) ((unsigned int *)((void *)(&(v))))
+#define FLAG_VAR(v) unsigned int *_aux_ = MAGIC_CAST(v)
+
+#define TEST_FLAG_SWEEP(g, f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
+#define SET_FLAG_SWEEP(g, f)                                                   \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG_SWEEP(g, f)                                                 \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS_SWEEP(g)                                               \
+  {                                                                            \
+    FLAG_VAR(LOCAL(g, FLAGS));                                                 \
+    (*_aux_) = 0;                                                              \
+  }
+
+#define TEST_FLAG(g, x, y, z, f)                                               \
+  ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
+#define SET_FLAG(g, x, y, z, f)                                                \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) |= (f);                                                           \
+  }
+#define CLEAR_FLAG(g, x, y, z, f)                                              \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) &= ~(f);                                                          \
+  }
+#define CLEAR_ALL_FLAGS(g, x, y, z)                                            \
+  {                                                                            \
+    FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS));                                   \
+    (*_aux_) = 0;                                                              \
+  }
 
 /*############################################################################*/
 
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp
index 91e9722353a40ba21911003b298616bde879b497..b51864366b500fc796d9073fe1893be2f402797f 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.cpp
@@ -15,11 +15,11 @@
 #include <visc.h>
 
 #include "layout_config.h"
+#include "lbm.h"
 #include "lbm_macros.h"
 #include "main.h"
-#include "lbm.h"
 
-#define AS_UINT(x) (*((unsigned*)&(x)))
+#define AS_UINT(x) (*((unsigned *)&(x)))
 
 /*############################################################################*/
 
@@ -29,404 +29,396 @@ static LBM_Grid srcGrid, dstGrid;
 
 struct pb_TimerSet timers;
 
-
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) {
-    struct stat fileStat;
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *params) {
+  struct stat fileStat;
 
-    if( nArgs < 2 ) {
-        printf( "syntax: lbm <time steps>\n" );
-        exit( 1 );
-    }
+  if (nArgs < 2) {
+    printf("syntax: lbm <time steps>\n");
+    exit(1);
+  }
+
+  param->nTimeSteps = atoi(arg[1]);
 
-    param->nTimeSteps     = atoi( arg[1] );
-
-    if( params->inpFiles[0] != NULL ) {
-        param->obstacleFilename = params->inpFiles[0];
-
-        if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-            printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-                    param->obstacleFilename );
-            exit( 1 );
-        }
-        if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-            printf( "MAIN_parseCommandLine:\n"
-                    "\tsize of file '%s' is %i bytes\n"
-                    "\texpected size is %i bytes\n",
-                    param->obstacleFilename, (int) fileStat.st_size,
-                    SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-            exit( 1 );
-        }
+  if (params->inpFiles[0] != NULL) {
+    param->obstacleFilename = params->inpFiles[0];
+
+    if (stat(param->obstacleFilename, &fileStat) != 0) {
+      printf("MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+             param->obstacleFilename);
+      exit(1);
+    }
+    if (fileStat.st_size != SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z) {
+      printf("MAIN_parseCommandLine:\n"
+             "\tsize of file '%s' is %i bytes\n"
+             "\texpected size is %i bytes\n",
+             param->obstacleFilename, (int)fileStat.st_size,
+             SIZE_X * SIZE_Y * SIZE_Z + (SIZE_Y + 1) * SIZE_Z);
+      exit(1);
     }
-    else param->obstacleFilename = NULL;
+  } else
+    param->obstacleFilename = NULL;
 
-    param->resultFilename = params->outFile;
+  param->resultFilename = params->outFile;
 }
 
 /*############################################################################*/
 
-void MAIN_printInfo( const MAIN_Param* param ) {
-    printf( "MAIN_printInfo:\n"
-            "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-            "\tnTimeSteps     : %i\n"
-            "\tresult file    : %s\n"
-            "\taction         : %s\n"
-            "\tsimulation type: %s\n"
-            "\tobstacle file  : %s\n\n",
-            SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-            param->nTimeSteps, param->resultFilename,
-            "store", "lid-driven cavity",
-            (param->obstacleFilename == NULL) ? "<none>" :
-            param->obstacleFilename );
+void MAIN_printInfo(const MAIN_Param *param) {
+  printf("MAIN_printInfo:\n"
+         "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+         "\tnTimeSteps     : %i\n"
+         "\tresult file    : %s\n"
+         "\taction         : %s\n"
+         "\tsimulation type: %s\n"
+         "\tobstacle file  : %s\n\n",
+         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
+         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
+         (param->obstacleFilename == NULL) ? "<none>"
+                                           : param->obstacleFilename);
 }
 
 /*############################################################################*/
 
 typedef struct __attribute__((__packed__)) {
-    float* srcG; size_t bytes_srcG;
-    float* dstG; size_t bytes_dstG;
-    size_t dim_X1, dim_X2, dim_Y2;
+  float *srcG;
+  size_t bytes_srcG;
+  float *dstG;
+  size_t bytes_dstG;
+  size_t dim_X1, dim_X2, dim_Y2;
 } RootIn;
 
-void performStreamCollide_kernel( float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG )
-{
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(2, srcG, dstG, 1, dstG);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    srcG += MARGIN;
-    dstG += MARGIN;
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-
-    //Using some predefined macros here.  Consider this the declaration
-    //  and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z
-
-    SWEEP_VAR
-    SWEEP_X = lx; // get_local_id(0)
-    SWEEP_Y = gx; // get_group_id(0)
-    SWEEP_Z = gy; // get_group_id(1)
-
-    float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB;
-    float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST ;
-    float tempSB, tempET, tempEB, tempWT, tempWB ;
-
-    //Load all of the input fields
-    //This is a gather operation of the SCATTER preprocessor variable
-    // is undefined in layout_config.h, or a "local" read otherwise
-    tempC = SRC_C(srcG);
-
-    tempN = SRC_N(srcG);
-    tempS = SRC_S(srcG);
-    tempE = SRC_E(srcG);
-    tempW = SRC_W(srcG);
-    tempT = SRC_T(srcG);
-    tempB = SRC_B(srcG);
-
-    tempNE = SRC_NE(srcG);
-    tempNW = SRC_NW(srcG);
-    tempSE = SRC_SE(srcG);
-    tempSW = SRC_SW(srcG);
-    tempNT = SRC_NT(srcG);
-    tempNB = SRC_NB(srcG);
-    tempST = SRC_ST(srcG);
-    tempSB = SRC_SB(srcG);
-    tempET = SRC_ET(srcG);
-    tempEB = SRC_EB(srcG);
-    tempWT = SRC_WT(srcG);
-    tempWB = SRC_WB(srcG);
-
-    //Test whether the cell is fluid or obstacle
-    if(AS_UINT(LOCAL(srcG,FLAGS)) & (OBSTACLE)) {
-
-        //Swizzle the inputs: reflect any fluid coming into this cell
-        // back to where it came from
-        temp_swp = tempN ;
-        tempN = tempS ;
-        tempS = temp_swp ;
-        temp_swp = tempE ;
-        tempE = tempW ;
-        tempW = temp_swp;
-        temp_swp = tempT ;
-        tempT = tempB ;
-        tempB = temp_swp;
-        temp_swp = tempNE;
-        tempNE = tempSW ;
-        tempSW = temp_swp;
-        temp_swp = tempNW;
-        tempNW = tempSE ;
-        tempSE = temp_swp;
-        temp_swp = tempNT ;
-        tempNT = tempSB ;
-        tempSB = temp_swp;
-        temp_swp = tempNB ;
-        tempNB = tempST ;
-        tempST = temp_swp;
-        temp_swp = tempET ;
-        tempET= tempWB ;
-        tempWB = temp_swp;
-        temp_swp = tempEB ;
-        tempEB = tempWT ;
-        tempWT = temp_swp;
-    }
-    else {
-
-        //The math meat of LBM: ignore for optimization
-        float ux, uy, uz, rho, u2;
-        float temp1, temp2, temp_base;
-        rho = tempC + tempN
-              + tempS + tempE
-              + tempW + tempT
-              + tempB + tempNE
-              + tempNW + tempSE
-              + tempSW + tempNT
-              + tempNB + tempST
-              + tempSB + tempET
-              + tempEB + tempWT
-              + tempWB;
-
-        ux = + tempE - tempW
-             + tempNE - tempNW
-             + tempSE - tempSW
-             + tempET + tempEB
-             - tempWT - tempWB;
-
-        uy = + tempN - tempS
-             + tempNE + tempNW
-             - tempSE - tempSW
-             + tempNT + tempNB
-             - tempST - tempSB;
-
-        uz = + tempT - tempB
-             + tempNT - tempNB
-             + tempST - tempSB
-             + tempET - tempEB
-             + tempWT - tempWB;
-
-        ux /= rho;
-        uy /= rho;
-        uz /= rho;
-
-        if(AS_UINT(LOCAL(srcG,FLAGS)) & (ACCEL)) {
-
-            ux = 0.005f;
-            uy = 0.002f;
-            uz = 0.000f;
-        }
-
-        u2 = 1.5f * (ux*ux + uy*uy + uz*uz) - 1.0f;
-        temp_base = OMEGA*rho;
-        temp1 = DFL1*temp_base;
-
-        //Put the output values for this cell in the shared memory
-        temp_base = OMEGA*rho;
-        temp1 = DFL1*temp_base;
-        temp2 = 1.0f-OMEGA;
-        tempC = temp2*tempC + temp1*(                                 - u2);
-        temp1 = DFL2*temp_base;
-        tempN = temp2*tempN + temp1*(       uy*(4.5f*uy       + 3.0f) - u2);
-        tempS = temp2*tempS + temp1*(       uy*(4.5f*uy       - 3.0f) - u2);
-        tempT = temp2*tempT + temp1*(       uz*(4.5f*uz       + 3.0f) - u2);
-        tempB = temp2*tempB + temp1*(       uz*(4.5f*uz       - 3.0f) - u2);
-        tempE = temp2*tempE + temp1*(       ux*(4.5f*ux       + 3.0f) - u2);
-        tempW = temp2*tempW + temp1*(       ux*(4.5f*ux       - 3.0f) - u2);
-        temp1 = DFL3*temp_base;
-        tempNT= temp2*tempNT + temp1 *( (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2);
-        tempNB= temp2*tempNB + temp1 *( (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2);
-        tempST= temp2*tempST + temp1 *( (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2);
-        tempSB= temp2*tempSB + temp1 *( (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2);
-        tempNE = temp2*tempNE + temp1 *( (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2);
-        tempSE = temp2*tempSE + temp1 *((+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2);
-        tempET = temp2*tempET + temp1 *( (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2);
-        tempEB = temp2*tempEB + temp1 *( (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2);
-        tempNW = temp2*tempNW + temp1 *( (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2);
-        tempSW = temp2*tempSW + temp1 *( (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2);
-        tempWT = temp2*tempWT + temp1 *( (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2);
-        tempWB = temp2*tempWB + temp1 *( (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2);
+void performStreamCollide_kernel(float *srcG, size_t bytes_srcG, float *dstG,
+                                 size_t bytes_dstG) {
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(2, srcG, dstG, 1, dstG);
+
+  void *thisNode = __visc__getNode();
+  void *parentNode = __visc__getParentNode(thisNode);
+
+  srcG += MARGIN;
+  dstG += MARGIN;
+
+  int lx = __visc__getNodeInstanceID_x(thisNode);
+  int gx = __visc__getNodeInstanceID_x(parentNode);
+  int gy = __visc__getNodeInstanceID_y(parentNode);
+
+  // Using some predefined macros here.  Consider this the declaration
+  //  and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z
+
+  SWEEP_VAR
+  SWEEP_X = lx; // get_local_id(0)
+  SWEEP_Y = gx; // get_group_id(0)
+  SWEEP_Z = gy; // get_group_id(1)
+
+  float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB;
+  float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST;
+  float tempSB, tempET, tempEB, tempWT, tempWB;
+
+  // Load all of the input fields
+  // This is a gather operation if the SCATTER preprocessor variable
+  // is undefined in layout_config.h, or a "local" read otherwise
+  tempC = SRC_C(srcG);
+
+  tempN = SRC_N(srcG);
+  tempS = SRC_S(srcG);
+  tempE = SRC_E(srcG);
+  tempW = SRC_W(srcG);
+  tempT = SRC_T(srcG);
+  tempB = SRC_B(srcG);
+
+  tempNE = SRC_NE(srcG);
+  tempNW = SRC_NW(srcG);
+  tempSE = SRC_SE(srcG);
+  tempSW = SRC_SW(srcG);
+  tempNT = SRC_NT(srcG);
+  tempNB = SRC_NB(srcG);
+  tempST = SRC_ST(srcG);
+  tempSB = SRC_SB(srcG);
+  tempET = SRC_ET(srcG);
+  tempEB = SRC_EB(srcG);
+  tempWT = SRC_WT(srcG);
+  tempWB = SRC_WB(srcG);
+
+  // Test whether the cell is fluid or obstacle
+  if (AS_UINT(LOCAL(srcG, FLAGS)) & (OBSTACLE)) {
+
+    // Swizzle the inputs: reflect any fluid coming into this cell
+    // back to where it came from
+    temp_swp = tempN;
+    tempN = tempS;
+    tempS = temp_swp;
+    temp_swp = tempE;
+    tempE = tempW;
+    tempW = temp_swp;
+    temp_swp = tempT;
+    tempT = tempB;
+    tempB = temp_swp;
+    temp_swp = tempNE;
+    tempNE = tempSW;
+    tempSW = temp_swp;
+    temp_swp = tempNW;
+    tempNW = tempSE;
+    tempSE = temp_swp;
+    temp_swp = tempNT;
+    tempNT = tempSB;
+    tempSB = temp_swp;
+    temp_swp = tempNB;
+    tempNB = tempST;
+    tempST = temp_swp;
+    temp_swp = tempET;
+    tempET = tempWB;
+    tempWB = temp_swp;
+    temp_swp = tempEB;
+    tempEB = tempWT;
+    tempWT = temp_swp;
+  } else {
+
+    // The math meat of LBM: ignore for optimization
+    float ux, uy, uz, rho, u2;
+    float temp1, temp2, temp_base;
+    rho = tempC + tempN + tempS + tempE + tempW + tempT + tempB + tempNE +
+          tempNW + tempSE + tempSW + tempNT + tempNB + tempST + tempSB +
+          tempET + tempEB + tempWT + tempWB;
+
+    ux = +tempE - tempW + tempNE - tempNW + tempSE - tempSW + tempET + tempEB -
+         tempWT - tempWB;
+
+    uy = +tempN - tempS + tempNE + tempNW - tempSE - tempSW + tempNT + tempNB -
+         tempST - tempSB;
+
+    uz = +tempT - tempB + tempNT - tempNB + tempST - tempSB + tempET - tempEB +
+         tempWT - tempWB;
+
+    ux /= rho;
+    uy /= rho;
+    uz /= rho;
+
+    if (AS_UINT(LOCAL(srcG, FLAGS)) & (ACCEL)) {
+
+      ux = 0.005f;
+      uy = 0.002f;
+      uz = 0.000f;
     }
 
-    //Write the results computed above
-    //This is a scatter operation of the SCATTER preprocessor variable
-    // is defined in layout_config.h, or a "local" write otherwise
-    DST_C ( dstG ) = tempC;
-
-    DST_N ( dstG ) = tempN;
-    DST_S ( dstG ) = tempS;
-    DST_E ( dstG ) = tempE;
-    DST_W ( dstG ) = tempW;
-    DST_T ( dstG ) = tempT;
-    DST_B ( dstG ) = tempB;
-
-    DST_NE( dstG ) = tempNE;
-    DST_NW( dstG ) = tempNW;
-    DST_SE( dstG ) = tempSE;
-    DST_SW( dstG ) = tempSW;
-    DST_NT( dstG ) = tempNT;
-    DST_NB( dstG ) = tempNB;
-    DST_ST( dstG ) = tempST;
-    DST_SB( dstG ) = tempSB;
-    DST_ET( dstG ) = tempET;
-    DST_EB( dstG ) = tempEB;
-    DST_WT( dstG ) = tempWT;
-    DST_WB( dstG ) = tempWB;
+    u2 = 1.5f * (ux * ux + uy * uy + uz * uz) - 1.0f;
+    temp_base = OMEGA * rho;
+    temp1 = DFL1 * temp_base;
+
+    // Put the output values for this cell in the shared memory
+    temp_base = OMEGA * rho;
+    temp1 = DFL1 * temp_base;
+    temp2 = 1.0f - OMEGA;
+    tempC = temp2 * tempC + temp1 * (-u2);
+    temp1 = DFL2 * temp_base;
+    tempN = temp2 * tempN + temp1 * (uy * (4.5f * uy + 3.0f) - u2);
+    tempS = temp2 * tempS + temp1 * (uy * (4.5f * uy - 3.0f) - u2);
+    tempT = temp2 * tempT + temp1 * (uz * (4.5f * uz + 3.0f) - u2);
+    tempB = temp2 * tempB + temp1 * (uz * (4.5f * uz - 3.0f) - u2);
+    tempE = temp2 * tempE + temp1 * (ux * (4.5f * ux + 3.0f) - u2);
+    tempW = temp2 * tempW + temp1 * (ux * (4.5f * ux - 3.0f) - u2);
+    temp1 = DFL3 * temp_base;
+    tempNT =
+        temp2 * tempNT + temp1 * ((+uy + uz) * (4.5f * (+uy + uz) + 3.0f) - u2);
+    tempNB =
+        temp2 * tempNB + temp1 * ((+uy - uz) * (4.5f * (+uy - uz) + 3.0f) - u2);
+    tempST =
+        temp2 * tempST + temp1 * ((-uy + uz) * (4.5f * (-uy + uz) + 3.0f) - u2);
+    tempSB =
+        temp2 * tempSB + temp1 * ((-uy - uz) * (4.5f * (-uy - uz) + 3.0f) - u2);
+    tempNE =
+        temp2 * tempNE + temp1 * ((+ux + uy) * (4.5f * (+ux + uy) + 3.0f) - u2);
+    tempSE =
+        temp2 * tempSE + temp1 * ((+ux - uy) * (4.5f * (+ux - uy) + 3.0f) - u2);
+    tempET =
+        temp2 * tempET + temp1 * ((+ux + uz) * (4.5f * (+ux + uz) + 3.0f) - u2);
+    tempEB =
+        temp2 * tempEB + temp1 * ((+ux - uz) * (4.5f * (+ux - uz) + 3.0f) - u2);
+    tempNW =
+        temp2 * tempNW + temp1 * ((-ux + uy) * (4.5f * (-ux + uy) + 3.0f) - u2);
+    tempSW =
+        temp2 * tempSW + temp1 * ((-ux - uy) * (4.5f * (-ux - uy) + 3.0f) - u2);
+    tempWT =
+        temp2 * tempWT + temp1 * ((-ux + uz) * (4.5f * (-ux + uz) + 3.0f) - u2);
+    tempWB =
+        temp2 * tempWB + temp1 * ((-ux - uz) * (4.5f * (-ux - uz) + 3.0f) - u2);
+  }
+
+  // Write the results computed above
+  // This is a scatter operation if the SCATTER preprocessor variable
+  // is defined in layout_config.h, or a "local" write otherwise
+  DST_C(dstG) = tempC;
+
+  DST_N(dstG) = tempN;
+  DST_S(dstG) = tempS;
+  DST_E(dstG) = tempE;
+  DST_W(dstG) = tempW;
+  DST_T(dstG) = tempT;
+  DST_B(dstG) = tempB;
+
+  DST_NE(dstG) = tempNE;
+  DST_NW(dstG) = tempNW;
+  DST_SE(dstG) = tempSE;
+  DST_SW(dstG) = tempSW;
+  DST_NT(dstG) = tempNT;
+  DST_NB(dstG) = tempNB;
+  DST_ST(dstG) = tempST;
+  DST_SB(dstG) = tempSB;
+  DST_ET(dstG) = tempET;
+  DST_EB(dstG) = tempEB;
+  DST_WT(dstG) = tempWT;
+  DST_WB(dstG) = tempWB;
 }
 
-void lbmLvl1(float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG, size_t dim_X1)
-{
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(2, srcG, dstG, 1, dstG);
-    void* lbm_node = __visc__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1);
-    __visc__bindIn(lbm_node, 0, 0, 0);
-    __visc__bindIn(lbm_node, 1, 1, 0);
-    __visc__bindIn(lbm_node, 2, 2, 0);
-    __visc__bindIn(lbm_node, 3, 3, 0);
+void lbmLvl1(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG,
+             size_t dim_X1) {
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(2, srcG, dstG, 1, dstG);
+  void *lbm_node =
+      __visc__createNodeND(2, performStreamCollide_kernel, dim_X1, (size_t)1);
+  __visc__bindIn(lbm_node, 0, 0, 0);
+  __visc__bindIn(lbm_node, 1, 1, 0);
+  __visc__bindIn(lbm_node, 2, 2, 0);
+  __visc__bindIn(lbm_node, 3, 3, 0);
 }
 
-void lbmLvl2(float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG, size_t dim_X1, size_t dim_X2, size_t dim_Y2)
-{
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(2, srcG, dstG, 1, dstG);
-    void* lbm_node = __visc__createNodeND(2, lbmLvl1, dim_X2, dim_Y2);
-    __visc__bindIn(lbm_node, 0, 0, 0);
-    __visc__bindIn(lbm_node, 1, 1, 0);
-    __visc__bindIn(lbm_node, 2, 2, 0);
-    __visc__bindIn(lbm_node, 3, 3, 0);
-    __visc__bindIn(lbm_node, 4, 4, 0);
+void lbmLvl2(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG,
+             size_t dim_X1, size_t dim_X2, size_t dim_Y2) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, srcG, dstG, 1, dstG);
+  void *lbm_node = __visc__createNodeND(2, lbmLvl1, dim_X2, dim_Y2);
+  __visc__bindIn(lbm_node, 0, 0, 0);
+  __visc__bindIn(lbm_node, 1, 1, 0);
+  __visc__bindIn(lbm_node, 2, 2, 0);
+  __visc__bindIn(lbm_node, 3, 3, 0);
+  __visc__bindIn(lbm_node, 4, 4, 0);
 }
 
-void lbmLvl3(float* srcG, size_t bytes_srcG, float* dstG, size_t bytes_dstG, size_t dim_X1, size_t dim_X2, size_t dim_Y2)
-{
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(2, srcG, dstG, 1, dstG);
-    void* lbm_node = __visc__createNodeND(0, lbmLvl2);
-    __visc__bindIn(lbm_node, 0, 0, 0);
-    __visc__bindIn(lbm_node, 1, 1, 0);
-    __visc__bindIn(lbm_node, 2, 2, 0);
-    __visc__bindIn(lbm_node, 3, 3, 0);
-    __visc__bindIn(lbm_node, 4, 4, 0);
-    __visc__bindIn(lbm_node, 5, 5, 0);
-    __visc__bindIn(lbm_node, 6, 6, 0);
+void lbmLvl3(float *srcG, size_t bytes_srcG, float *dstG, size_t bytes_dstG,
+             size_t dim_X1, size_t dim_X2, size_t dim_Y2) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, srcG, dstG, 1, dstG);
+  void *lbm_node = __visc__createNodeND(0, lbmLvl2);
+  __visc__bindIn(lbm_node, 0, 0, 0);
+  __visc__bindIn(lbm_node, 1, 1, 0);
+  __visc__bindIn(lbm_node, 2, 2, 0);
+  __visc__bindIn(lbm_node, 3, 3, 0);
+  __visc__bindIn(lbm_node, 4, 4, 0);
+  __visc__bindIn(lbm_node, 5, 5, 0);
+  __visc__bindIn(lbm_node, 6, 6, 0);
 }
 
-__attribute__((noinline)) void MAIN_performStreamCollide( LBM_Grid src, LBM_Grid dst ) {
-
-    long dimBlock[3] = {SIZE_X,1,1};
-    long dimGrid[3] = {SIZE_X*SIZE_Y,SIZE_Z,1};
-    size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-    
-    void* root_in = malloc(sizeof(RootIn));
-    RootIn root_in_local = {
-        src - MARGIN, size,
-        dst - MARGIN, size,
-        SIZE_X, SIZE_Y, SIZE_Z
-    };
-    *(RootIn*)root_in = root_in_local;
-    void* lbmDFG = __visc__launch(0, lbmLvl3, root_in);
-    
-    __visc__wait(lbmDFG);
+__attribute__((noinline)) void MAIN_performStreamCollide(LBM_Grid src,
+                                                         LBM_Grid dst) {
+
+  long dimBlock[3] = {SIZE_X, 1, 1};
+  long dimGrid[3] = {SIZE_X * SIZE_Y, SIZE_Z, 1};
+  size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+
+  void *root_in = malloc(sizeof(RootIn));
+  RootIn root_in_local = {src - MARGIN, size,   dst - MARGIN, size,
+                          SIZE_X,       SIZE_Y, SIZE_Z};
+  *(RootIn *)root_in = root_in_local;
+  void *lbmDFG = __visc__launch(0, lbmLvl3, root_in);
 
+  __visc__wait(lbmDFG);
 }
 
-void MAIN_initialize( const MAIN_Param* param ) {
+void MAIN_initialize(const MAIN_Param *param) {
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //Setup datastructures
-    LBM_allocateGrid( (float**) &srcGrid );
-    LBM_allocateGrid( (float**) &dstGrid );
-    LBM_initializeGrid( srcGrid );
-    LBM_initializeGrid( dstGrid );
-
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    if( param->obstacleFilename != NULL ) {
-        LBM_loadObstacleFile( srcGrid, param->obstacleFilename );
-        LBM_loadObstacleFile( dstGrid, param->obstacleFilename );
-    }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // Set up data structures
+  LBM_allocateGrid((float **)&srcGrid);
+  LBM_allocateGrid((float **)&dstGrid);
+  LBM_initializeGrid(srcGrid);
+  LBM_initializeGrid(dstGrid);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    LBM_initializeSpecialCellsForLDC( srcGrid );
-    LBM_initializeSpecialCellsForLDC( dstGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  if (param->obstacleFilename != NULL) {
+    LBM_loadObstacleFile(srcGrid, param->obstacleFilename);
+    LBM_loadObstacleFile(dstGrid, param->obstacleFilename);
+  }
 
-    LBM_showGridStatistics( srcGrid );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  LBM_initializeSpecialCellsForLDC(srcGrid);
+  LBM_initializeSpecialCellsForLDC(dstGrid);
 
-    //LBM_freeGrid( (float**) &srcGrid );
-    //LBM_freeGrid( (float**) &dstGrid );
+  LBM_showGridStatistics(srcGrid);
+
+  // LBM_freeGrid( (float**) &srcGrid );
+  // LBM_freeGrid( (float**) &dstGrid );
 }
 
 /*############################################################################*/
 
-void MAIN_finalize( const MAIN_Param* param ) {
-
-    //Setup TEMP datastructures
+void MAIN_finalize(const MAIN_Param *param) {
 
-    LBM_showGridStatistics( srcGrid );
+  // Show grid statistics, store the velocity field, then free both grids
 
-    LBM_storeVelocityField( srcGrid, param->resultFilename, TRUE );
+  LBM_showGridStatistics(srcGrid);
 
-    LBM_freeGrid( (float**) &srcGrid );
-    LBM_freeGrid( (float**) &dstGrid );
+  LBM_storeVelocityField(srcGrid, param->resultFilename, TRUE);
 
+  LBM_freeGrid((float **)&srcGrid);
+  LBM_freeGrid((float **)&dstGrid);
 }
 
-int main( int nArgs, char* arg[] ) {
-    MAIN_Param param;
-    int t;
+int main(int nArgs, char *arg[]) {
+  MAIN_Param param;
+  int t;
 
-    struct pb_Parameters* params;
-    params = pb_ReadParameters(&nArgs, arg);
+  struct pb_Parameters *params;
+  params = pb_ReadParameters(&nArgs, arg);
 
+  // Parse the command line and print the run configuration
+  MAIN_parseCommandLine(nArgs, arg, &param, params);
+  MAIN_printInfo(&param);
 
-    //Setup TEMP datastructures
-    MAIN_parseCommandLine( nArgs, arg, &param, params );
-    MAIN_printInfo( &param );
+  MAIN_initialize(&param);
 
-    MAIN_initialize( &param );
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
+  size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
+  llvm_visc_track_mem(srcGrid - MARGIN, size);
+  llvm_visc_track_mem(dstGrid - MARGIN, size);
 
-    size_t size   = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-    llvm_visc_track_mem(srcGrid-MARGIN, size);
-    llvm_visc_track_mem(dstGrid-MARGIN, size);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  for (t = 1; t <= param.nTimeSteps; t++) {
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+    MAIN_performStreamCollide(srcGrid, dstGrid);
 
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    for( t = 1; t <= param.nTimeSteps; t++ ) {
-        pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        MAIN_performStreamCollide( srcGrid, dstGrid );
-
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-        LBM_swapGrids( &srcGrid, &dstGrid );
+    LBM_swapGrids(&srcGrid, &dstGrid);
 
-        /*if( (t & 63) == 0 ) {*/
-            /*printf( "timestep: %i\n", t );*/
+    /*if( (t & 63) == 0 ) {*/
+    /*printf( "timestep: %i\n", t );*/
 #if 0
             CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
             LBM_showGridStatistics( *TEMP_srcGrid );
 #endif
-        /*}*/
-    }
+    /*}*/
+  }
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    llvm_visc_request_mem(srcGrid-MARGIN, size);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_visc_request_mem(srcGrid - MARGIN, size);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-    llvm_visc_untrack_mem(srcGrid-MARGIN);
-    llvm_visc_untrack_mem(dstGrid-MARGIN);
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  llvm_visc_untrack_mem(srcGrid - MARGIN);
+  llvm_visc_untrack_mem(dstGrid - MARGIN);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  __visc__cleanup();
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    MAIN_finalize( &param );
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  MAIN_finalize(&param);
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    pb_FreeParameters(params);
-    return 0;
+  pb_FreeParameters(params);
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h
index 17728ccb8652dda18f4e08e16c567ec0d4abe4b5..e7d1de926379246587e72c53a3bed3eff4444f0a 100644
--- a/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h
+++ b/hpvm/test/parboil/benchmarks/lbm/src/visc/main.h
@@ -12,17 +12,18 @@
 /*############################################################################*/
 
 typedef struct {
-	int nTimeSteps;
-	char* resultFilename;
-	char* obstacleFilename;
+  int nTimeSteps;
+  char *resultFilename;
+  char *obstacleFilename;
 } MAIN_Param;
 
 /*############################################################################*/
 
-void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
-void MAIN_printInfo( const MAIN_Param* param );
-void MAIN_initialize( const MAIN_Param* param );
-void MAIN_finalize( const MAIN_Param* param );
+void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
+                           struct pb_Parameters *);
+void MAIN_printInfo(const MAIN_Param *param);
+void MAIN_initialize(const MAIN_Param *param);
+void MAIN_finalize(const MAIN_Param *param);
 void MAIN_performStreamCollide(LBM_Grid src, LBM_Grid dst);
 
 /*############################################################################*/
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/base/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/base/io.cc
index 61defbd7ea670c56ec4b0d2a321a9800376667aa..f62566c651aeca227c06a0c58784f34d7cb6f751 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/base/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/base/io.cc
@@ -10,15 +10,15 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -27,32 +27,30 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/base/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/base/main.cc
index e6fb3a580d0947bd4a7ba9c1e4e7634f5a6fffae..f856cbe45d18d02ce08563b296e0aef9eed4c178 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/base/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/base/main.cc
@@ -6,27 +6,28 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include "sgemm_kernel.cc"
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <parboil.h>
-#include <iostream>
-#include "sgemm_kernel.cc"
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
 
-int
-main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -37,49 +38,44 @@ main (int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      exit(-1);
-    }
- 
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
+
   /* Read in data */
   pb_SwitchToTimer(&timers, pb_TimerID_IO);
 
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
 
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   // allocate space for C
-  std::vector<float> matC(matArow*matBcol);
+  std::vector<float> matC(matArow * matBcol);
 
   // Use standard sgemm interface
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f,
-      &matA.front(), matArow, &matBT.front(), matBcol, 0.0f, &matC.front(),
-      matArow);
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), matArow,
+             &matBT.front(), matBcol, 0.0f, &matC.front(), matArow);
 
   if (params->outFile) {
     /* Write C to file */
     pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); 
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   double CPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_COMPUTE]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/CPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / CPUtime / 1e9
+            << std::endl;
   pb_PrintTimerSet(&timers);
   pb_FreeParameters(params);
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/base/sgemm_kernel.cc b/hpvm/test/parboil/benchmarks/sgemm/src/base/sgemm_kernel.cc
index e46fac0c88a53d54df310c42695d06391b4a9c3e..b38116ced427728b4b3d8e90a9591cdeeda8c967 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/base/sgemm_kernel.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/base/sgemm_kernel.cc
@@ -6,33 +6,32 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Base C implementation of MM
  */
 
-
-
-void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc )
-{
+void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                const float *A, int lda, const float *B, int ldb, float beta,
+                float *C, int ldc) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   if ((transb != 'T') && (transb != 't')) {
     std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   for (int mm = 0; mm < m; ++mm) {
     for (int nn = 0; nn < n; ++nn) {
       float c = 0.0f;
       for (int i = 0; i < k; ++i) {
-        float a = A[mm + i * lda]; 
+        float a = A[mm + i * lda];
         float b = B[nn + i * ldb];
         c += a * b;
       }
-      C[mm+nn*ldc] = C[mm+nn*ldc] * beta + alpha * c;
+      C[mm + nn * ldc] = C[mm + nn * ldc] * beta + alpha * c;
     }
   }
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/cuda/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/cuda/io.cc
index 61defbd7ea670c56ec4b0d2a321a9800376667aa..f62566c651aeca227c06a0c58784f34d7cb6f751 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/cuda/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/cuda/io.cc
@@ -10,15 +10,15 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -27,32 +27,30 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/cuda_base/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/cuda_base/io.cc
index 61defbd7ea670c56ec4b0d2a321a9800376667aa..f62566c651aeca227c06a0c58784f34d7cb6f751 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/cuda_base/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/cuda_base/io.cc
@@ -10,15 +10,15 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -27,32 +27,30 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/io.cc
index 61defbd7ea670c56ec4b0d2a321a9800376667aa..f62566c651aeca227c06a0c58784f34d7cb6f751 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/io.cc
@@ -10,15 +10,15 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -27,32 +27,30 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/main.cc
index 20b377a74a4662dbf1d53a0ca92e3c2e7a64a0e0..20140a1ada56ffd36bd367023c5023305b14ae2e 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/main.cc
@@ -6,28 +6,31 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <parboil.h>
-#include <iostream>
 
-extern void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc );
+extern void basicSgemm(char transa, char transb, int m, int n, int k,
+                       float alpha, const float *A, int lda, const float *B,
+                       int ldb, float beta, float *C, int ldc);
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
 
-int
-main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -38,49 +41,44 @@ main (int argc, char *argv[]) {
 
   pb_InitializeTimerSet(&timers);
 
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      exit(-1);
-    }
- 
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
+
   /* Read in data */
   pb_SwitchToTimer(&timers, pb_TimerID_IO);
 
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
 
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   // allocate space for C
-  std::vector<float> matC(matArow*matBcol);
+  std::vector<float> matC(matArow * matBcol);
 
   // Use standard sgemm interface
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f,
-      &matA.front(), matArow, &matBT.front(), matBcol, 0.0f, &matC.front(),
-      matArow);
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), matArow,
+             &matBT.front(), matBcol, 0.0f, &matC.front(), matArow);
 
   if (params->outFile) {
     /* Write C to file */
     pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC); 
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   double CPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_COMPUTE]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/CPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / CPUtime / 1e9
+            << std::endl;
   pb_PrintTimerSet(&timers);
   pb_FreeParameters(params);
   return 0;
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/sgemm_kernel.cc b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/sgemm_kernel.cc
index 53b3835c0eee1c251c015e1dcf3655184e8393e0..b9896c17312967ba7a431becb1b3bda4b315ba4a 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/sgemm_kernel.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/omp_base/sgemm_kernel.cc
@@ -6,34 +6,34 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Base C implementation of MM
  */
 
 #include <iostream>
 
-
-void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, const float *A, int lda, const float *B, int ldb, float beta, float *C, int ldc )
-{
+void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                const float *A, int lda, const float *B, int ldb, float beta,
+                float *C, int ldc) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   if ((transb != 'T') && (transb != 't')) {
     std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
     return;
   }
-  #pragma omp parallel for collapse (2)
+#pragma omp parallel for collapse(2)
   for (int mm = 0; mm < m; ++mm) {
     for (int nn = 0; nn < n; ++nn) {
       float c = 0.0f;
       for (int i = 0; i < k; ++i) {
-        float a = A[mm + i * lda]; 
+        float a = A[mm + i * lda];
         float b = B[nn + i * ldb];
         c += a * b;
       }
-      C[mm+nn*ldc] = C[mm+nn*ldc] * beta + alpha * c;
+      C[mm + nn * ldc] = C[mm + nn * ldc] * beta + alpha * c;
     }
   }
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc
index 21c9e2301099395daf2a95e43c6365f93e1d7859..5489f6a55ce6e8ba3676b0c98ad4b37ac7f4a7fd 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base/main.cc
@@ -6,79 +6,83 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <CL/cl.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <CL/cl.h>
-#include <parboil.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_SZ 16
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<< errorMessage <<": "<< clStatus <<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << ": " << clStatus << " Error!\n";              \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue )
-{
+void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   if ((transb != 'T') && (transb != 't')) {
     std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_SZ) || (n%TILE_SZ)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
-      << "; n should be multiple of " << TILE_SZ << std::endl;
+  if ((m % TILE_SZ) || (n % TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_SZ << "; n should be multiple of " << TILE_SZ
+              << std::endl;
   }
 
-  size_t db[2] = {TILE_SZ,TILE_SZ};
-  size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
+  size_t db[2] = {TILE_SZ, TILE_SZ};
+  size_t dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
 
   cl_int clStatus;
- 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                    0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
 
-  clStatus = clFinish(clCommandQueue); 
+  clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -88,146 +92,151 @@ int main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
 
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
+
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")};
-  //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  const char *clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")};
+  // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
   cl_kernel clKernel;
   cl_program clProgram;
-  pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  //CHECK_ERROR("clCreateProgramWithSource")
-
-  //char clOptions[50];
-  //sprintf(clOptions,"");
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT",
+      &clContext, &clDevice, &clProgram, &clKernel);
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
 
-  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  //CHECK_ERROR("clBuildProgram")
+  // char clOptions[50];
+  // sprintf(clOptions,"");
 
-  //size_t binarySizes = 0;
-  //clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySizes, NULL);
-  //CHECK_ERROR("clGetProgramInfo")
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
 
-  //std::cout << "Binary Size = " << binarySizes << "\n";
+  // size_t binarySizes = 0;
+  // clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES,
+  // sizeof(size_t), &binarySizes, NULL); CHECK_ERROR("clGetProgramInfo")
 
-  //unsigned char* binaries = (unsigned char*) malloc(binarySizes);
-  //clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, binarySizes, &binaries, NULL);
-  //CHECK_ERROR("clGetProgramInfo")
+  // std::cout << "Binary Size = " << binarySizes << "\n";
 
-  //std::cout << "Binary = \n" << binaries << "\n";
+  // unsigned char* binaries = (unsigned char*) malloc(binarySizes);
+  // clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, binarySizes,
+  // &binaries, NULL); CHECK_ERROR("clGetProgramInfo")
 
+  // std::cout << "Binary = \n" << binaries << "\n";
 
-  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
-  //CHECK_ERROR("clCreateKernel")
+  // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
-  B_sz = matBrow*matBcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   // Copy A and B^T into device memory
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-	matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
   // Use standard sgemm interface
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+             matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
 
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
   clStatus = clReleaseMemObject(dB);
   clStatus = clReleaseMemObject(dC);
   clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext); 
- 
+  clStatus = clReleaseContext(clContext);
+
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  
+
   if (params->outFile) {
-   
+
     /* Write C to file */
-    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
- 
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/io.cc
index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
-	
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc
index c8118d34bb10e229ce604e8bfab05b9923f2e315..105baf590da13dd2ffc3cb803d63291daef0854d 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_opt/main.cc
@@ -6,42 +6,45 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <parboil.h>
-#include <iostream>
 
 #include <CL/cl.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_N 16
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error! Errorcode = "<< clStatus <<"\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error! Errorcode = " << clStatus << "\n";   \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue )
-{
+void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                  cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                  int ldc, cl_kernel clKernel,
+                  cl_command_queue clCommandQueue) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     exit(1);
@@ -53,38 +56,38 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c
   }
 
   // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_M) || (n%TILE_N)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-              << "; n should be multiple of " << TILE_N << std::endl;
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
     exit(1);
   }
-  
-  size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-  size_t db[2] = {TILE_N,TILE_TB_HEIGHT};
-  //printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]);
+
+  size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
+  size_t db[2] = {TILE_N, TILE_TB_HEIGHT};
+  // printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]);
 
   cl_int clStatus;
 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                    0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
-  
+
   clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int
-main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -94,136 +97,142 @@ main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      printf("%s\n",params->inpFiles[0]);
-      printf("%s\n",params->inpFiles[1]);
-      printf("%s\n",params->inpFiles[2]);
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    printf("%s\n", params->inpFiles[0]);
+    printf("%s\n", params->inpFiles[1]);
+    printf("%s\n", params->inpFiles[2]);
+    exit(-1);
+  }
 
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
 
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
- 
+
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
+
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
-  
+
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  //const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")};
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  //CHECK_ERROR("clCreateProgramWithSource")
+  // const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")};
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
 
-  //char clOptions[50];
-  //sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M);
+  // char clOptions[50];
+  // sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D
+  // TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M);
 
-  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  //CHECK_ERROR("clBuildProgram")
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
 
-  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
-  //CHECK_ERROR("clCreateKernel")
+  // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
 
   cl_kernel clKernel;
   cl_program clProgram;
-  pb_CreateAndBuildKernelFromBinary("build/opencl_base_opt_default/kernel.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_base_opt_default/kernel.nvptx.s", "mysgemmNT", &clContext,
+      &clDevice, &clProgram, &clKernel);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  B_sz = matBrow*matBcol*sizeof(float);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-   
+
   // Copy A and B^T into device memory
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-        matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
   // Use standard sgemm interface
-  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue);
+  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+               matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
- 
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseMemObject(dA);
   clReleaseMemObject(dB);
   clReleaseMemObject(dC);
   clReleaseCommandQueue(clCommandQueue);
-  clReleaseContext(clContext); 
-  
+  clReleaseContext(clContext);
+
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  
+
   if (params->outFile) {
-    
+
     /* Write C to file */
     pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
- 
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc
index bfcd70395e74a4cdcd39d3d7f609cf2c7a2d702f..f72c18c293c52e322a35814b13c000f9b64548b0 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_base_vec/main.cc
@@ -6,82 +6,86 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <CL/cl.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <CL/cl.h>
-#include <parboil.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_SZ 16
 #define VEC_SZ 8
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<< errorMessage <<": "<< clStatus <<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << ": " << clStatus << " Error!\n";              \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue )
-{
+void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   if ((transb != 'T') && (transb != 't')) {
     std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_SZ) || (n%TILE_SZ)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
-      << "; n should be multiple of " << TILE_SZ << std::endl;
+  if ((m % TILE_SZ) || (n % TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_SZ << "; n should be multiple of " << TILE_SZ
+              << std::endl;
   }
 
-  size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ};
-  size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
+  size_t db[2] = {TILE_SZ / VEC_SZ, TILE_SZ};
+  size_t dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
 
   std::cout << "(" << dg[0] << ", " << dg[1] << ")\n";
   std::cout << "(" << db[0] << ", " << db[1] << ")\n";
   cl_int clStatus;
- 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                    0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
 
-  clStatus = clFinish(clCommandQueue); 
+  clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -91,134 +95,141 @@ int main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
 
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
+
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  // const char* clSource[] = {readFile("src/opencl_base/kernel_offline.nvptx.s")};
-  // cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // const char* clSource[] =
+  // {readFile("src/opencl_base/kernel_offline.nvptx.s")}; cl_program clProgram
+  // = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
   cl_kernel clKernel;
   cl_program clProgram;
-  pb_CreateAndBuildKernelFromBinary("build/opencl_base_vec_default/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  //CHECK_ERROR("clCreateProgramWithSource")
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_base_vec_default/kernel_offline.nvptx.s", "mysgemmNT",
+      &clContext, &clDevice, &clProgram, &clKernel);
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
 
-  //char clOptions[50];
-  //sprintf(clOptions,"");
+  // char clOptions[50];
+  // sprintf(clOptions,"");
 
-  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  //CHECK_ERROR("clBuildProgram")
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
 
-  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
-  //CHECK_ERROR("clCreateKernel")
+  // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
-  B_sz = matBrow*matBcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   // Copy A and B^T into device memory
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-	matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
   // Use standard sgemm interface
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+             matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
 
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
   clStatus = clReleaseMemObject(dB);
   clStatus = clReleaseMemObject(dC);
   clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext); 
- 
+  clStatus = clReleaseContext(clContext);
+
   pb_PrintTimerSet(&timers);
-  
+
   if (params->outFile) {
-   
+
     /* Write C to file */
-    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
- 
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc
index 5c75ac7367a284f8acd797393c13bc8384856bcb..744ee4096664e2f11620fae388a0a848a8cd49ac 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu/main.cc
@@ -6,80 +6,84 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <CL/cl.h>
+#include <fstream>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <fstream>
-#include <CL/cl.h>
-#include <parboil.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_SZ 16
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<< errorMessage <<": "<< clStatus <<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << ": " << clStatus << " Error!\n";              \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue )
-{
+void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   if ((transb != 'T') && (transb != 't')) {
     std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_SZ) || (n%TILE_SZ)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
-      << "; n should be multiple of " << TILE_SZ << std::endl;
+  if ((m % TILE_SZ) || (n % TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_SZ << "; n should be multiple of " << TILE_SZ
+              << std::endl;
   }
 
-  size_t db[2] = {TILE_SZ,TILE_SZ};
-  size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
+  size_t db[2] = {TILE_SZ, TILE_SZ};
+  size_t dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
 
   cl_int clStatus;
- 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                    0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
 
-  clStatus = clFinish(clCommandQueue); 
+  clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -89,33 +93,27 @@ int main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
 
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
 
-  cl_uint numPlatforms; 
+  cl_uint numPlatforms;
   clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
 
   cl_platform_id clPlatform[numPlatforms];
@@ -124,151 +122,163 @@ int main (int argc, char *argv[]) {
 
   char buffer[1000];
   size_t bytes;
-  clStatus = clGetPlatformInfo(clPlatform[1], CL_PLATFORM_EXTENSIONS, 1000, buffer, &bytes);
+  clStatus = clGetPlatformInfo(clPlatform[1], CL_PLATFORM_EXTENSIONS, 1000,
+                               buffer, &bytes);
   CHECK_ERROR("clGetPlatformInfo")
 
   printf("\nExtensions: %s\n", buffer);
 
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  //printf("Device id = %p\n", clDevice);
-  //cl_device_partition_property props[3];
-  //props[0] = CL_DEVICE_PARTITION_EQUALLY;
-  //props[1] = 1;
-  //props[2] = 0;
-  //cl_device_id subdevice_id[8];
-  //cl_uint num_entries = 8;
-
-  //cl_uint numDevices;
-  //clCreateSubDevices(clDevice, props, num_entries, subdevice_id, &numDevices);
-  //printf("Num of devices = %d\n", numDevices);
-  //for(unsigned i =0 ; i< numDevices; i++) {
-    //printf("Subdevice id %d = %p\n", i, subdevice_id[i]);
+  // printf("Device id = %p\n", clDevice);
+  // cl_device_partition_property props[3];
+  // props[0] = CL_DEVICE_PARTITION_EQUALLY;
+  // props[1] = 1;
+  // props[2] = 0;
+  // cl_device_id subdevice_id[8];
+  // cl_uint num_entries = 8;
+
+  // cl_uint numDevices;
+  // clCreateSubDevices(clDevice, props, num_entries, subdevice_id,
+  // &numDevices); printf("Num of devices = %d\n", numDevices); for(unsigned i
+  // =0 ; i< numDevices; i++) { printf("Subdevice id %d = %p\n", i,
+  // subdevice_id[i]);
   //}
-  //clDevice = subdevice_id[0];
- 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-  //cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-  cl_context clContext = clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus);
+  // clDevice = subdevice_id[0];
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+  // cl_context clContext =
+  // clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
+  cl_context clContext =
+      clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};
-  //const char* clSource[] = {readFile("kernel-spir-64-2.bc")};
-  //size_t binarySize = 1112;
-  //std::cout << "Size of binary = " << binarySize << "\n";
-  //cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice, &binarySize, (const unsigned char**)clSource, NULL, &clStatus);
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  //cl_kernel clKernel;
-  //cl_program clProgram;
-  //pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  //CHECK_ERROR("clCreateProgramWithSource")
+  const char *clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};
+  // const char* clSource[] = {readFile("kernel-spir-64-2.bc")};
+  // size_t binarySize = 1112;
+  // std::cout << "Size of binary = " << binarySize << "\n";
+  // cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice,
+  // &binarySize, (const unsigned char**)clSource, NULL, &clStatus);
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  // cl_kernel clKernel;
+  // cl_program clProgram;
+  // pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s",
+  // "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel); cl_program
+  // clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"");
+  sprintf(clOptions, "");
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
 
-  //size_t binarySize;
-  //clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, NULL);
-  //CHECK_ERROR("clGetProgramInfo")
-  //std::cout << "Binary Size = " << binarySize << "\n";
-  //unsigned char* binary = (unsigned char*) malloc(binarySize*sizeof(unsigned char));
-  //size_t returnSize = 0;
-  //clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, binarySize*sizeof(unsigned char) , &binary, &returnSize);
-  //CHECK_ERROR("clGetProgramInfo")
-
-  //std::ofstream kernelfile;
-  //kernelfile.open ("kernel.o", std::ios::out | std::ios::binary);
-  //for(unsigned i=0; i<binarySize; i++)
-    //kernelfile << binary[i];
-  //kernelfile.close();
-
-  //free(binary);
-  //std::cout << "Output binary\n";
-
-
-  cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  // size_t binarySize;
+  // clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES,
+  // sizeof(size_t), &binarySize, NULL); CHECK_ERROR("clGetProgramInfo")
+  // std::cout << "Binary Size = " << binarySize << "\n";
+  // unsigned char* binary = (unsigned char*) malloc(binarySize*sizeof(unsigned
+  // char)); size_t returnSize = 0; clStatus = clGetProgramInfo(clProgram,
+  // CL_PROGRAM_BINARIES, binarySize*sizeof(unsigned char) , &binary,
+  // &returnSize); CHECK_ERROR("clGetProgramInfo")
+
+  // std::ofstream kernelfile;
+  // kernelfile.open ("kernel.o", std::ios::out | std::ios::binary);
+  // for(unsigned i=0; i<binarySize; i++)
+  // kernelfile << binary[i];
+  // kernelfile.close();
+
+  // free(binary);
+  // std::cout << "Output binary\n";
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "mysgemmNT", &clStatus);
   CHECK_ERROR("clCreateKernel")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
-  B_sz = matBrow*matBcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   // Copy A and B^T into device memory
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-	matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
   // Use standard sgemm interface
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+             matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
 
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
   clStatus = clReleaseMemObject(dB);
   clStatus = clReleaseMemObject(dC);
   clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext); 
- 
+  clStatus = clReleaseContext(clContext);
+
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  
+
   if (params->outFile) {
-   
+
     /* Write C to file */
-    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
- 
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc
index 898295a6bf1f313279820dd5c8366a2a66f03ef9..45ed8e942a1a69475b75a63a24b70655f1ffa2aa 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_4K/main.cc
@@ -6,26 +6,28 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <CL/cl.h>
+#include <fstream>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <fstream>
-#include <CL/cl.h>
-#include <parboil.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 //#define TILE_SZ 16
@@ -33,65 +35,68 @@ extern char* readFile(const char*);
 
 #define TILE_N 8
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<< errorMessage <<": "<< clStatus <<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << ": " << clStatus << " Error!\n";              \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, pb_TimerSet& timers )
-{
+void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue,
+                pb_TimerSet &timers) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   if ((transb != 'T') && (transb != 't')) {
     std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   // In this code we assume the matrix sizes are multiple of tile size
-  //if ((m%TILE_SZ) || (n%TILE_SZ)) {
-    //std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
-      //<< "; n should be multiple of " << TILE_SZ << std::endl;
+  // if ((m%TILE_SZ) || (n%TILE_SZ)) {
+  // std::cerr << "unsupported size of matrix. m should be multiple of " <<
+  // TILE_SZ
+  //<< "; n should be multiple of " << TILE_SZ << std::endl;
   //}
 
-  //size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ};
-  //size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
+  // size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ};
+  // size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
 
-  size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-  size_t db[2] = {TILE_N,TILE_TB_HEIGHT};
+  size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
+  size_t db[2] = {TILE_N, TILE_TB_HEIGHT};
   cl_int clStatus;
- 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
   pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  for(int i=0; i<1; i++) {
-  
-    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  for (int i = 0; i < 1; i++) {
+
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                      0, NULL, NULL);
     CHECK_ERROR("clEnqueueNDRangeKernel")
 
-    clStatus = clFinish(clCommandQueue); 
+    clStatus = clFinish(clCommandQueue);
     CHECK_ERROR("clFinish")
   }
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -101,168 +106,172 @@ int main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
 
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
 
-  cl_uint numPlatforms; 
+  cl_uint numPlatforms;
   clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
 
   cl_platform_id clPlatform[numPlatforms];
   clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  //char buffer[1000];
-  //size_t bytes;
-  //clStatus = clGetPlatformInfo(clPlatform[1], CL_PLATFORM_EXTENSIONS, 1000, buffer, &bytes);
-  //CHECK_ERROR("clGetPlatformInfo")
+  // char buffer[1000];
+  // size_t bytes;
+  // clStatus = clGetPlatformInfo(clPlatform[1], CL_PLATFORM_EXTENSIONS, 1000,
+  // buffer, &bytes); CHECK_ERROR("clGetPlatformInfo")
 
-  //printf("\nExtensions: %s\n", buffer);
+  // printf("\nExtensions: %s\n", buffer);
 
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  //printf("Device id = %p\n", clDevice);
-  //cl_device_partition_property props[3];
-  //props[0] = CL_DEVICE_PARTITION_EQUALLY;
-  //props[1] = 1;
-  //props[2] = 0;
-  //cl_device_id subdevice_id[8];
-  //cl_uint num_entries = 8;
-
-  //cl_uint numDevices;
-  //clCreateSubDevices(clDevice, props, num_entries, subdevice_id, &numDevices);
-  //printf("Num of devices = %d\n", numDevices);
-  //for(unsigned i =0 ; i< numDevices; i++) {
-    //printf("Subdevice id %d = %p\n", i, subdevice_id[i]);
+  // printf("Device id = %p\n", clDevice);
+  // cl_device_partition_property props[3];
+  // props[0] = CL_DEVICE_PARTITION_EQUALLY;
+  // props[1] = 1;
+  // props[2] = 0;
+  // cl_device_id subdevice_id[8];
+  // cl_uint num_entries = 8;
+
+  // cl_uint numDevices;
+  // clCreateSubDevices(clDevice, props, num_entries, subdevice_id,
+  // &numDevices); printf("Num of devices = %d\n", numDevices); for(unsigned i
+  // =0 ; i< numDevices; i++) { printf("Subdevice id %d = %p\n", i,
+  // subdevice_id[i]);
   //}
-  //clDevice = subdevice_id[0];
- 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-  //cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-  cl_context clContext = clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus);
+  // clDevice = subdevice_id[0];
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+  // cl_context clContext =
+  // clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
+  cl_context clContext =
+      clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  //size_t binarySize = 1112;
-  //std::cout << "Size of binary = " << binarySize << "\n";
-  //cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice, &binarySize, (const unsigned char**)clSource, NULL, &clStatus);
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // size_t binarySize = 1112;
+  // std::cout << "Size of binary = " << binarySize << "\n";
+  // cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice,
+  // &binarySize, (const unsigned char**)clSource, NULL, &clStatus); cl_program
+  // clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
   cl_kernel clKernel;
   cl_program clProgram;
-  pb_CreateAndBuildKernelFromBinary("kernel.ir", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
+  pb_CreateAndBuildKernelFromBinary("kernel.ir", "mysgemmNT", &clContext,
+                                    &clDevice, &clProgram, &clKernel);
   CHECK_ERROR("Binary")
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  //CHECK_ERROR("clCreateProgramWithSource")
-
-  //char clOptions[50];
-  //sprintf(clOptions,"");
-
-  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  //CHECK_ERROR("clBuildProgram")
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
 
+  // char clOptions[50];
+  // sprintf(clOptions,"");
 
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
 
-  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
-  //CHECK_ERROR("clCreateKernel")
+  // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
-  B_sz = matBrow*matBcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   // Copy A and B^T into device memory
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-	matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
   // Use standard sgemm interface
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers);
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+             matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
 
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
   clStatus = clReleaseMemObject(dB);
   clStatus = clReleaseMemObject(dC);
   clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext); 
- 
+  clStatus = clReleaseContext(clContext);
+
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  
+
   if (params->outFile) {
-   
+
     /* Write C to file */
-    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
- 
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc
index 1d05e00de1b594071a3c58e816c6e31124140854..d8275be777079f1a57e585b3057685f737f38ed3 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_baseline/main.cc
@@ -6,26 +6,28 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <CL/cl.h>
+#include <fstream>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <fstream>
-#include <CL/cl.h>
-#include <parboil.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 //#define TILE_SZ 16
@@ -33,65 +35,68 @@ extern char* readFile(const char*);
 
 #define TILE_N 16
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<< errorMessage <<": "<< clStatus <<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << ": " << clStatus << " Error!\n";              \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, pb_TimerSet& timers )
-{
+void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue,
+                pb_TimerSet &timers) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   if ((transb != 'T') && (transb != 't')) {
     std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   // In this code we assume the matrix sizes are multiple of tile size
-  //if ((m%TILE_SZ) || (n%TILE_SZ)) {
-    //std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
-      //<< "; n should be multiple of " << TILE_SZ << std::endl;
+  // if ((m%TILE_SZ) || (n%TILE_SZ)) {
+  // std::cerr << "unsupported size of matrix. m should be multiple of " <<
+  // TILE_SZ
+  //<< "; n should be multiple of " << TILE_SZ << std::endl;
   //}
 
-  //size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ};
-  //size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
+  // size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ};
+  // size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
 
-  size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-  size_t db[2] = {TILE_N,TILE_TB_HEIGHT};
+  size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
+  size_t db[2] = {TILE_N, TILE_TB_HEIGHT};
   cl_int clStatus;
- 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  //pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  //for(int i=0; i<15; i++) {
-  
-    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueNDRangeKernel")
+  // pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  // for(int i=0; i<15; i++) {
 
-    clStatus = clFinish(clCommandQueue); 
-    CHECK_ERROR("clFinish")
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                    0, NULL, NULL);
+  CHECK_ERROR("clEnqueueNDRangeKernel")
+
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
   //}
-  //pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  // pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -101,33 +106,27 @@ int main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
 
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
 
-  cl_uint numPlatforms; 
+  cl_uint numPlatforms;
   clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
 
   cl_platform_id clPlatform[numPlatforms];
@@ -135,102 +134,109 @@ int main (int argc, char *argv[]) {
   CHECK_ERROR("clGetPlatformIDs")
 
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-  cl_context clContext = clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+  cl_context clContext =
+      clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  const char *clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
   CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"");
+  sprintf(clOptions, "");
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
 
-
-
-  cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  cl_kernel clKernel = clCreateKernel(clProgram, "mysgemmNT", &clStatus);
   CHECK_ERROR("clCreateKernel")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
-  B_sz = matBrow*matBcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   // Copy A and B^T into device memory
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-	matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
   // Use standard sgemm interface
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers);
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+             matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
 
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
   clStatus = clReleaseMemObject(dB);
   clStatus = clReleaseMemObject(dC);
   clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext); 
- 
+  clStatus = clReleaseContext(clContext);
+
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  
+
   if (params->outFile) {
-   
+
     /* Write C to file */
-    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
- 
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc
index 6bd1d9b3e0b16ddb4054f4fdf14dc4aa7d544b19..b4e561ded6b82bf2b84aa4dbab2f5f4b5bceab7b 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_medium/main.cc
@@ -6,26 +6,28 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <CL/cl.h>
+#include <fstream>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <fstream>
-#include <CL/cl.h>
-#include <parboil.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 //#define TILE_SZ 16
@@ -33,65 +35,68 @@ extern char* readFile(const char*);
 
 #define TILE_N 8
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<< errorMessage <<": "<< clStatus <<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << ": " << clStatus << " Error!\n";              \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, pb_TimerSet& timers )
-{
+void basicSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue,
+                pb_TimerSet &timers) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   if ((transb != 'T') && (transb != 't')) {
     std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
     return;
   }
-  
+
   // In this code we assume the matrix sizes are multiple of tile size
-  //if ((m%TILE_SZ) || (n%TILE_SZ)) {
-    //std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
-      //<< "; n should be multiple of " << TILE_SZ << std::endl;
+  // if ((m%TILE_SZ) || (n%TILE_SZ)) {
+  // std::cerr << "unsupported size of matrix. m should be multiple of " <<
+  // TILE_SZ
+  //<< "; n should be multiple of " << TILE_SZ << std::endl;
   //}
 
-  //size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ};
-  //size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
+  // size_t db[2] = {TILE_SZ/VEC_SZ,TILE_SZ};
+  // size_t dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
 
-  size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-  size_t db[2] = {TILE_N,TILE_TB_HEIGHT};
+  size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
+  size_t db[2] = {TILE_N, TILE_TB_HEIGHT};
   cl_int clStatus;
- 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
   pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  //for(int i=0; i<15; i++) {
-  
-    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueNDRangeKernel")
+  // for(int i=0; i<15; i++) {
 
-    clStatus = clFinish(clCommandQueue); 
-    CHECK_ERROR("clFinish")
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                    0, NULL, NULL);
+  CHECK_ERROR("clEnqueueNDRangeKernel")
+
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
   //}
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -101,168 +106,172 @@ int main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
 
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
 
-  cl_uint numPlatforms; 
+  cl_uint numPlatforms;
   clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
 
   cl_platform_id clPlatform[numPlatforms];
   clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  //char buffer[1000];
-  //size_t bytes;
-  //clStatus = clGetPlatformInfo(clPlatform[1], CL_PLATFORM_EXTENSIONS, 1000, buffer, &bytes);
-  //CHECK_ERROR("clGetPlatformInfo")
+  // char buffer[1000];
+  // size_t bytes;
+  // clStatus = clGetPlatformInfo(clPlatform[1], CL_PLATFORM_EXTENSIONS, 1000,
+  // buffer, &bytes); CHECK_ERROR("clGetPlatformInfo")
 
-  //printf("\nExtensions: %s\n", buffer);
+  // printf("\nExtensions: %s\n", buffer);
 
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  //printf("Device id = %p\n", clDevice);
-  //cl_device_partition_property props[3];
-  //props[0] = CL_DEVICE_PARTITION_EQUALLY;
-  //props[1] = 1;
-  //props[2] = 0;
-  //cl_device_id subdevice_id[8];
-  //cl_uint num_entries = 8;
-
-  //cl_uint numDevices;
-  //clCreateSubDevices(clDevice, props, num_entries, subdevice_id, &numDevices);
-  //printf("Num of devices = %d\n", numDevices);
-  //for(unsigned i =0 ; i< numDevices; i++) {
-    //printf("Subdevice id %d = %p\n", i, subdevice_id[i]);
+  // printf("Device id = %p\n", clDevice);
+  // cl_device_partition_property props[3];
+  // props[0] = CL_DEVICE_PARTITION_EQUALLY;
+  // props[1] = 1;
+  // props[2] = 0;
+  // cl_device_id subdevice_id[8];
+  // cl_uint num_entries = 8;
+
+  // cl_uint numDevices;
+  // clCreateSubDevices(clDevice, props, num_entries, subdevice_id,
+  // &numDevices); printf("Num of devices = %d\n", numDevices); for(unsigned i
+  // =0 ; i< numDevices; i++) { printf("Subdevice id %d = %p\n", i,
+  // subdevice_id[i]);
   //}
-  //clDevice = subdevice_id[0];
- 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-  //cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-  cl_context clContext = clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus);
+  // clDevice = subdevice_id[0];
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+  // cl_context clContext =
+  // clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
+  cl_context clContext =
+      clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  //size_t binarySize = 1112;
-  //std::cout << "Size of binary = " << binarySize << "\n";
-  //cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice, &binarySize, (const unsigned char**)clSource, NULL, &clStatus);
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // size_t binarySize = 1112;
+  // std::cout << "Size of binary = " << binarySize << "\n";
+  // cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice,
+  // &binarySize, (const unsigned char**)clSource, NULL, &clStatus); cl_program
+  // clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
   cl_kernel clKernel;
   cl_program clProgram;
-  pb_CreateAndBuildKernelFromBinary("kernel.ir", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
+  pb_CreateAndBuildKernelFromBinary("kernel.ir", "mysgemmNT", &clContext,
+                                    &clDevice, &clProgram, &clKernel);
   CHECK_ERROR("Binary")
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  //CHECK_ERROR("clCreateProgramWithSource")
-
-  //char clOptions[50];
-  //sprintf(clOptions,"");
-
-  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  //CHECK_ERROR("clBuildProgram")
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
 
+  // char clOptions[50];
+  // sprintf(clOptions,"");
 
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
 
-  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
-  //CHECK_ERROR("clCreateKernel")
+  // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
-  B_sz = matBrow*matBcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   // Copy A and B^T into device memory
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-	matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
   // Use standard sgemm interface
-  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers);
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+             matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
 
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   clStatus = clReleaseKernel(clKernel);
   clStatus = clReleaseProgram(clProgram);
   clStatus = clReleaseMemObject(dA);
   clStatus = clReleaseMemObject(dB);
   clStatus = clReleaseMemObject(dC);
   clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext); 
- 
+  clStatus = clReleaseContext(clContext);
+
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  
+
   if (params->outFile) {
-   
+
     /* Write C to file */
-    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
- 
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/io.cc
index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
-	
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc
index 06d92225dc53780988fd2412cd8423ec0b9c1795..8de437a4f8935d5746dbcfbbe5345e0e66ae484a 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_cpu_sm/main.cc
@@ -6,42 +6,45 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <parboil.h>
-#include <iostream>
 
 #include <CL/cl.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_N 16
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue )
-{
+void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                  cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                  int ldc, cl_kernel clKernel,
+                  cl_command_queue clCommandQueue) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     exit(1);
@@ -53,37 +56,37 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c
   }
 
   // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_M) || (n%TILE_N)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-              << "; n should be multiple of " << TILE_N << std::endl;
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
     exit(1);
   }
-  
-  size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-  size_t db[2] = {TILE_N,TILE_TB_HEIGHT};
+
+  size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
+  size_t db[2] = {TILE_N, TILE_TB_HEIGHT};
 
   cl_int clStatus;
 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                    0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
-  
+
   clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int
-main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -93,133 +96,139 @@ main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      printf("%s\n",params->inpFiles[0]);
-      printf("%s\n",params->inpFiles[1]);
-      printf("%s\n",params->inpFiles[2]);
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    printf("%s\n", params->inpFiles[0]);
+    printf("%s\n", params->inpFiles[1]);
+    printf("%s\n", params->inpFiles[2]);
+    exit(-1);
+  }
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   cl_int clStatus;
 
-  cl_uint numPlatforms; 
+  cl_uint numPlatforms;
   clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
 
   cl_platform_id clPlatform[numPlatforms];
-  clStatus = clGetPlatformIDs(numPlatforms,clPlatform,NULL);
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-  cl_context clContext = clCreateContext(clCps, 1, &clDevice, NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+  cl_context clContext =
+      clCreateContext(clCps, 1, &clDevice, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
-  
+
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource[] = {readFile("src/opencl_cpu_sm/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  const char *clSource[] = {readFile("src/opencl_cpu_sm/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
   CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M);
+  sprintf(clOptions, "-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d", TILE_N,
+          TILE_TB_HEIGHT, TILE_M);
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
 
-  cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  cl_kernel clKernel = clCreateKernel(clProgram, "mysgemmNT", &clStatus);
   CHECK_ERROR("clCreateKernel")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
 
-  B_sz = matBrow*matBcol*sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
+  std::vector<float> matC(matArow * matBcol);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-   
+
   // Copy A and B^T into device memory
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-        matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
 
   // Use standard sgemm interface
-  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue);
+  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+               matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
- 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseMemObject(dA);
   clReleaseMemObject(dB);
   clReleaseMemObject(dC);
   clReleaseCommandQueue(clCommandQueue);
-  clReleaseContext(clContext); 
-   
-  pb_SwitchToTimer( &timers, pb_TimerID_NONE );
+  clReleaseContext(clContext);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
   if (params->outFile) {
     /* Write C to file */
-    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  free((void*)clSource[0]);
+  free((void *)clSource[0]);
 
-  
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc
index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
-	
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc
index fce77fed80666703863adc1a0e62ee9df6520e94..06f5da5c319811ebfc5aa8937559219b2feed625 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_nvidia/main.cc
@@ -6,42 +6,45 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <parboil.h>
-#include <iostream>
 
 #include <CL/cl.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_N 16
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue )
-{
+void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                  cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                  int ldc, cl_kernel clKernel,
+                  cl_command_queue clCommandQueue) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     exit(1);
@@ -53,37 +56,37 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c
   }
 
   // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_M) || (n%TILE_N)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-              << "; n should be multiple of " << TILE_N << std::endl;
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
     exit(1);
   }
-  
-  size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-  size_t db[2] = {TILE_N,TILE_TB_HEIGHT};
+
+  size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
+  size_t db[2] = {TILE_N, TILE_TB_HEIGHT};
 
   cl_int clStatus;
 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                    0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
-  
+
   clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int
-main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -93,120 +96,126 @@ main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      printf("%s\n",params->inpFiles[0]);
-      printf("%s\n",params->inpFiles[1]);
-      printf("%s\n",params->inpFiles[2]);
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    printf("%s\n", params->inpFiles[0]);
+    printf("%s\n", params->inpFiles[1]);
+    printf("%s\n", params->inpFiles[2]);
+    exit(-1);
+  }
 
   cl_int clStatus;
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
+
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
-  
+
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
   CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M);
+  sprintf(clOptions, "-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d", TILE_N,
+          TILE_TB_HEIGHT, TILE_M);
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
 
-  cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  cl_kernel clKernel = clCreateKernel(clProgram, "mysgemmNT", &clStatus);
   CHECK_ERROR("clCreateKernel")
 
   /* Read in data */
-  //pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // pb_SwitchToTimer(&timers, pb_TimerID_IO);
 
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
 
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  B_sz = matBrow*matBcol*sizeof(float);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-   
+
   // Copy A and B^T into device memory
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-        matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
 
   // Use standard sgemm interface
-  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue);
+  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+               matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
-    
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
   if (params->outFile) {
     /* Write C to file */
-    //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
   pb_FreeParameters(params);
 
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
 
-  free((void*)clSource[0]);
+  free((void *)clSource[0]);
 
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
@@ -214,7 +223,7 @@ main (int argc, char *argv[]) {
   clReleaseMemObject(dB);
   clReleaseMemObject(dC);
   clReleaseCommandQueue(clCommandQueue);
-  clReleaseContext(clContext); 
-  
+  clReleaseContext(clContext);
+
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/io.cc
index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
-	
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc
index 6949f86cafa3995db296b4e4486b4709020fb3e4..b22ebd8804bdb1204c42e2859aab69209dc77e4c 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8/main.cc
@@ -6,42 +6,45 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <parboil.h>
-#include <iostream>
 
 #include <CL/cl.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_N 16
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error! Errorcode = "<< clStatus <<"\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error! Errorcode = " << clStatus << "\n";   \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue )
-{
+void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                  cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                  int ldc, cl_kernel clKernel,
+                  cl_command_queue clCommandQueue) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     exit(1);
@@ -53,38 +56,38 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c
   }
 
   // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_M) || (n%TILE_N)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-              << "; n should be multiple of " << TILE_N << std::endl;
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
     exit(1);
   }
-  
-  size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-  size_t db[2] = {TILE_N,TILE_TB_HEIGHT};
-  //printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]);
+
+  size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
+  size_t db[2] = {TILE_N, TILE_TB_HEIGHT};
+  // printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]);
 
   cl_int clStatus;
 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                    0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
-  
+
   clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int
-main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -94,142 +97,145 @@ main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      printf("%s\n",params->inpFiles[0]);
-      printf("%s\n",params->inpFiles[1]);
-      printf("%s\n",params->inpFiles[2]);
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    printf("%s\n", params->inpFiles[0]);
+    printf("%s\n", params->inpFiles[1]);
+    printf("%s\n", params->inpFiles[2]);
+    exit(-1);
+  }
 
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // copy A to device memory
 
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
- 
+
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
+
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
-  
+
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource[] = {readFile("src/opencl_opt_8/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  const char *clSource[] = {readFile("src/opencl_opt_8/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
   CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M);
+  sprintf(clOptions, "-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d", TILE_N,
+          TILE_TB_HEIGHT, TILE_M);
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
 
-  cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  cl_kernel clKernel = clCreateKernel(clProgram, "mysgemmNT", &clStatus);
   CHECK_ERROR("clCreateKernel")
 
-  //cl_kernel clKernel;
-  //cl_program clProgram;
-  //pb_CreateAndBuildKernelFromBinary("build/opencl_opt_8_default/kernel.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
+  // cl_kernel clKernel;
+  // cl_program clProgram;
+  // pb_CreateAndBuildKernelFromBinary("build/opencl_opt_8_default/kernel.nvptx.s",
+  // "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  A_sz = matArow*matAcol*sizeof(float);
-  B_sz = matBrow*matBcol*sizeof(float);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  for(size_t i=0;i<matC.size();i++)
-        matC[i] = 0.0f;
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-   
+
   // Copy A and B^T into device memory
-  //std::cout << "Copying " << A_sz << " bytes of data to device\n";
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  // std::cout << "Copying " << A_sz << " bytes of data to device\n";
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  //std::cout << "Copying " << B_sz << " bytes of data to device\n";
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  // std::cout << "Copying " << B_sz << " bytes of data to device\n";
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-
-  //std::cout << "Copying " << C_sz << " bytes of data to device\n";
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  // std::cout << "Copying " << C_sz << " bytes of data to device\n";
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
   // Use standard sgemm interface
-  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue);
+  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+               matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
- 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseMemObject(dA);
   clReleaseMemObject(dB);
   clReleaseMemObject(dC);
   clReleaseCommandQueue(clCommandQueue);
-  clReleaseContext(clContext); 
-  
-  
+  clReleaseContext(clContext);
+
   if (params->outFile) {
-    
+
     /* Write C to file */
     pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
- 
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/io.cc
index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
-	
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc
index 4b385c7b57c87016500d63c5045f63894f7be347..a7cb9793e8c1ec991d5a3f3cd1676f7a88ff8e26 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_4K/main.cc
@@ -6,42 +6,45 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <parboil.h>
-#include <iostream>
 
 #include <CL/cl.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_N 8
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error! Errorcode = "<< clStatus <<"\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error! Errorcode = " << clStatus << "\n";   \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, pb_TimerSet& timers )
-{
+void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                  cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                  int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue,
+                  pb_TimerSet &timers) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     exit(1);
@@ -53,42 +56,41 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c
   }
 
   // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_M) || (n%TILE_N)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-              << "; n should be multiple of " << TILE_N << std::endl;
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
     exit(1);
   }
-  
-  size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-  size_t db[2] = {TILE_N,TILE_TB_HEIGHT};
-  //printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]);
+
+  size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
+  size_t db[2] = {TILE_N, TILE_TB_HEIGHT};
+  // printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]);
 
   cl_int clStatus;
 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
   pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  for(int i=0; i<4; i++) {
-    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  for (int i = 0; i < 4; i++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                      0, NULL, NULL);
     CHECK_ERROR("clEnqueueNDRangeKernel")
     clStatus = clFinish(clCommandQueue);
     CHECK_ERROR("clFinish")
   }
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  
 }
 
-int
-main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -98,136 +100,142 @@ main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      printf("%s\n",params->inpFiles[0]);
-      printf("%s\n",params->inpFiles[1]);
-      printf("%s\n",params->inpFiles[2]);
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    printf("%s\n", params->inpFiles[0]);
+    printf("%s\n", params->inpFiles[1]);
+    printf("%s\n", params->inpFiles[2]);
+    exit(-1);
+  }
 
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
 
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
- 
+
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
+
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
-  
+
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  //const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")};
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  //CHECK_ERROR("clCreateProgramWithSource")
+  // const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")};
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
 
-  //char clOptions[50];
-  //sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M);
+  // char clOptions[50];
+  // sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D
+  // TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M);
 
-  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  //CHECK_ERROR("clBuildProgram")
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
 
-  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
-  //CHECK_ERROR("clCreateKernel")
+  // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
 
   cl_kernel clKernel;
   cl_program clProgram;
-  pb_CreateAndBuildKernelFromBinary("build/opencl_opt_8_4K_default/kernel.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_opt_8_4K_default/kernel.nvptx.s", "mysgemmNT", &clContext,
+      &clDevice, &clProgram, &clKernel);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  B_sz = matBrow*matBcol*sizeof(float);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-   
+
   // Copy A and B^T into device memory
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-        matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
   // Use standard sgemm interface
-  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue, timers);
+  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+               matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
- 
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseMemObject(dA);
   clReleaseMemObject(dB);
   clReleaseMemObject(dC);
   clReleaseCommandQueue(clCommandQueue);
-  clReleaseContext(clContext); 
-  
+  clReleaseContext(clContext);
+
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  
+
   if (params->outFile) {
-    
+
     /* Write C to file */
     pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
- 
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/io.cc
index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
-	
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc
index f8fc432c7515d3c4051ad3a0c95915e013fffb32..713fd9e88966f885919bfba7df3bb0386c815f9a 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_medium/main.cc
@@ -6,42 +6,45 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <parboil.h>
-#include <iostream>
 
 #include <CL/cl.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_N 8
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error! Errorcode = "<< clStatus <<"\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error! Errorcode = " << clStatus << "\n";   \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue, pb_TimerSet& timers )
-{
+void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                  cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                  int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue,
+                  pb_TimerSet &timers) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     exit(1);
@@ -53,42 +56,41 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c
   }
 
   // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_M) || (n%TILE_N)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-              << "; n should be multiple of " << TILE_N << std::endl;
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
     exit(1);
   }
-  
-  size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-  size_t db[2] = {TILE_N,TILE_TB_HEIGHT};
-  //printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]);
+
+  size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
+  size_t db[2] = {TILE_N, TILE_TB_HEIGHT};
+  // printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]);
 
   cl_int clStatus;
 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
   pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-  for(int i=0; i<200; i++) {
-    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  for (int i = 0; i < 200; i++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                      0, NULL, NULL);
     CHECK_ERROR("clEnqueueNDRangeKernel")
     clStatus = clFinish(clCommandQueue);
     CHECK_ERROR("clFinish")
   }
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  
 }
 
-int
-main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -98,136 +100,142 @@ main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      printf("%s\n",params->inpFiles[0]);
-      printf("%s\n",params->inpFiles[1]);
-      printf("%s\n",params->inpFiles[2]);
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    printf("%s\n", params->inpFiles[0]);
+    printf("%s\n", params->inpFiles[1]);
+    printf("%s\n", params->inpFiles[2]);
+    exit(-1);
+  }
 
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
 
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
- 
+
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
+
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
-  
+
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  //const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")};
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  //CHECK_ERROR("clCreateProgramWithSource")
+  // const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")};
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
 
-  //char clOptions[50];
-  //sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M);
+  // char clOptions[50];
+  // sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D
+  // TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M);
 
-  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  //CHECK_ERROR("clBuildProgram")
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
 
-  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
-  //CHECK_ERROR("clCreateKernel")
+  // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
 
   cl_kernel clKernel;
   cl_program clProgram;
-  pb_CreateAndBuildKernelFromBinary("build/opencl_opt_8_medium_default/kernel.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_opt_8_medium_default/kernel.nvptx.s", "mysgemmNT",
+      &clContext, &clDevice, &clProgram, &clKernel);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  B_sz = matBrow*matBcol*sizeof(float);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-   
+
   // Copy A and B^T into device memory
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-        matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
   // Use standard sgemm interface
-  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue, timers);
+  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+               matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue, timers);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
- 
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseMemObject(dA);
   clReleaseMemObject(dB);
   clReleaseMemObject(dC);
   clReleaseCommandQueue(clCommandQueue);
-  clReleaseContext(clContext); 
-  
+  clReleaseContext(clContext);
+
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  
+
   if (params->outFile) {
-    
+
     /* Write C to file */
     pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
- 
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/io.cc
index 04b6579d254bf6648d50870724558a5ce7773bca..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
-	
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc
index dfab43744f12bf754473b14569c7019c22b55888..7d5d75c53341060d5d61e21ffdd4d8123aa019a9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/opencl_opt_8_vec/main.cc
@@ -6,42 +6,45 @@
  *cr
  ***************************************************************************/
 
-/* 
+/*
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <parboil.h>
-#include <iostream>
 
 #include <CL/cl.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_N 8
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error! Errorcode = "<< clStatus <<"\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error! Errorcode = " << clStatus << "\n";   \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C, int ldc, cl_kernel clKernel, cl_command_queue clCommandQueue )
-{
+void regtileSgemm(char transa, char transb, int m, int n, int k, float alpha,
+                  cl_mem A, int lda, cl_mem B, int ldb, float beta, cl_mem C,
+                  int ldc, cl_kernel clKernel,
+                  cl_command_queue clCommandQueue) {
   if ((transa != 'N') && (transa != 'n')) {
     std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
     exit(1);
@@ -53,38 +56,38 @@ void regtileSgemm( char transa, char transb, int m, int n, int k, float alpha, c
   }
 
   // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_M) || (n%TILE_N)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-              << "; n should be multiple of " << TILE_N << std::endl;
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
     exit(1);
   }
-  
-  size_t dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-  size_t db[2] = {TILE_N,TILE_TB_HEIGHT};
-  //printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]);
+
+  size_t dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
+  size_t db[2] = {TILE_N, TILE_TB_HEIGHT};
+  // printf("(%lu, %lu), (%lu, %lu)\n", db[0], db[1], dg[0], dg[1]);
 
   cl_int clStatus;
 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),(void*)&A);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),(void*)&lda);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&B);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),(void*)&ldb);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),(void*)&C);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ldc);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&k);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(float),(void*)&alpha);
-  clStatus = clSetKernelArg(clKernel,8,sizeof(float),(void*)&beta);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&A);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(int), (void *)&lda);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&B);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), (void *)&ldb);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), (void *)&C);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ldc);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&k);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(float), (void *)&alpha);
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(float), (void *)&beta);
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,2,NULL,dg,db,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 2, NULL, dg, db,
+                                    0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
-  
+
   clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int
-main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
   struct pb_Parameters *params;
   struct pb_TimerSet timers;
@@ -94,136 +97,142 @@ main (int argc, char *argv[]) {
   int matBrow, matBcol;
   std::vector<float> matA, matBT;
 
-
-  /* Read command line. Expect 3 inputs: A, B and B^T 
+  /* Read command line. Expect 3 inputs: A, B and B^T
      in column-major layout*/
   params = pb_ReadParameters(&argc, argv);
-  if ((params->inpFiles[0] == NULL) 
-      || (params->inpFiles[1] == NULL)
-      || (params->inpFiles[2] == NULL)
-      || (params->inpFiles[3] != NULL))
-    {
-      fprintf(stderr, "Expecting three input filenames\n");
-      printf("%s\n",params->inpFiles[0]);
-      printf("%s\n",params->inpFiles[1]);
-      printf("%s\n",params->inpFiles[2]);
-      exit(-1);
-    }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    printf("%s\n", params->inpFiles[0]);
+    printf("%s\n", params->inpFiles[1]);
+    printf("%s\n", params->inpFiles[2]);
+    exit(-1);
+  }
 
   /* Read in data */
   // load A
-  readColMajorMatrixFile(params->inpFiles[0],
-      matArow, matAcol, matA);
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
   // copy A to device memory
-  A_sz = matArow*matAcol*sizeof(float);
+  A_sz = matArow * matAcol * sizeof(float);
 
   // load B^T
-  readColMajorMatrixFile(params->inpFiles[2],
-      matBcol, matBrow, matBT);
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
   pb_InitializeTimerSet(&timers);
- 
+
   pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   cl_int clStatus;
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
+
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
-  
+
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  //const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")};
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  //CHECK_ERROR("clCreateProgramWithSource")
+  // const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")};
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
 
-  //char clOptions[50];
-  //sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M);
+  // char clOptions[50];
+  // sprintf(clOptions,"-D TILE_N=%d -D TILE_TB_HEIGHT=%d -D
+  // TILE_M=%d",TILE_N,TILE_TB_HEIGHT,TILE_M);
 
-  //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  //CHECK_ERROR("clBuildProgram")
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
 
-  //cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
-  //CHECK_ERROR("clCreateKernel")
+  // cl_kernel clKernel = clCreateKernel(clProgram,"mysgemmNT",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
 
   cl_kernel clKernel;
   cl_program clProgram;
-  pb_CreateAndBuildKernelFromBinary("build/opencl_opt_8_vec_default/kernel.nvptx.s", "mysgemmNT", &clContext, &clDevice, &clProgram, &clKernel);
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_opt_8_vec_default/kernel.nvptx.s", "mysgemmNT", &clContext,
+      &clDevice, &clProgram, &clKernel);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  B_sz = matBrow*matBcol*sizeof(float);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  B_sz = matBrow * matBcol * sizeof(float);
 
   // allocate space for C
-  C_sz = matArow*matBcol*sizeof(float);
+  C_sz = matArow * matBcol * sizeof(float);
 
   // OpenCL memory allocation
-  std::vector<float> matC(matArow*matBcol);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  cl_mem dA = clCreateBuffer(clContext,CL_MEM_READ_ONLY,A_sz,NULL,&clStatus);
+  std::vector<float> matC(matArow * matBcol);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  cl_mem dA =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, A_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dB = clCreateBuffer(clContext,CL_MEM_READ_ONLY,B_sz,NULL,&clStatus);
+  cl_mem dB =
+      clCreateBuffer(clContext, CL_MEM_READ_ONLY, B_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  cl_mem dC = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,C_sz,NULL,&clStatus);
+  cl_mem dC =
+      clCreateBuffer(clContext, CL_MEM_WRITE_ONLY, C_sz, NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-   
+
   // Copy A and B^T into device memory
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dA,CL_FALSE,0,A_sz,&matA.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dA, CL_FALSE, 0, A_sz,
+                                  &matA.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dB,CL_FALSE,0,B_sz,&matBT.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dB, CL_FALSE, 0, B_sz,
+                                  &matBT.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  for(int i=0;i<matC.size();i++)
-        matC[i] = 0.0f;
+  for (int i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz,
+                                  &matC.front(), 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
   // Use standard sgemm interface
-  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-      dA, matArow, dB, matBcol, 0.0f, dC, matArow,clKernel,clCommandQueue);
+  regtileSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, dA, matArow, dB,
+               matBcol, 0.0f, dC, matArow, clKernel, clCommandQueue);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clEnqueueReadBuffer(clCommandQueue,dC,CL_TRUE,0,C_sz,&matC.front(),0,NULL,NULL);
- 
-  pb_SwitchToTimer( &timers, visc_TimerID_SETUP);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clEnqueueReadBuffer(clCommandQueue, dC, CL_TRUE, 0, C_sz, &matC.front(), 0,
+                      NULL, NULL);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
   clReleaseKernel(clKernel);
   clReleaseProgram(clProgram);
   clReleaseMemObject(dA);
   clReleaseMemObject(dB);
   clReleaseMemObject(dC);
   clReleaseCommandQueue(clCommandQueue);
-  clReleaseContext(clContext); 
-  
+  clReleaseContext(clContext);
+
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
-  
+
   if (params->outFile) {
-    
+
     /* Write C to file */
     pb_SwitchToTimer(&timers, pb_TimerID_IO);
-    writeColMajorMatrixFile(params->outFile,
-	matArow, matBcol, matC); 
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
   }
 
-
   double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-  std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
   pb_FreeParameters(params);
 
-  //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
- 
   return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
index af9ee76e0fed3ced9e2666193afbd7c0631f1ce8..627f5a82412374cff4a9061620ce1f27ea3c14a6 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc/main.cc
@@ -10,272 +10,281 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <parboil.h>
 #include <visc.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_SZ 16
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
 typedef struct __attribute__((__packed__)) {
-    float* A; size_t bytes_A;
-    int lda;
-    float* B; size_t bytes_B;
-    int ldb;
-    float* C; size_t bytes_C;
-    int ldc;
-    int k; float alpha; float beta;
-    size_t dim_X1, dim_Y1, dim_X2, dim_Y2;
+  float *A;
+  size_t bytes_A;
+  int lda;
+  float *B;
+  size_t bytes_B;
+  int ldb;
+  float *C;
+  size_t bytes_C;
+  int ldc;
+  int k;
+  float alpha;
+  float beta;
+  size_t dim_X1, dim_Y1, dim_X2, dim_Y2;
 } RootIn;
 
-void mysgemmNT(
-    float* A, size_t bytes_A, int lda, float* B, size_t bytes_B, int ldb, float* C, size_t bytes_C,
-    int ldc, int k, float alpha, float beta
-) {
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(3, A, B, C, 1, C);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-    int gridx = __visc__getNumNodeInstances_x(thisNode);
-    int gridy = __visc__getNumNodeInstances_y(thisNode);
-    int m = gx * gridx + lx;
-    int n = gy * gridy + ly;
-
-    float c = 0.0f;
-    for (int i = 0; i < k; ++i) {
-        float a = A[m + i * lda];
-        float b = B[n + i * ldb];
-        c += a * b;
-    }
-    C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c;
+void mysgemmNT(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
+               int ldb, float *C, size_t bytes_C, int ldc, int k, float alpha,
+               float beta) {
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(3, A, B, C, 1, C);
+
+  void *thisNode = __visc__getNode();
+  void *parentNode = __visc__getParentNode(thisNode);
+  int lx = __visc__getNodeInstanceID_x(thisNode);
+  int ly = __visc__getNodeInstanceID_y(thisNode);
+  int gx = __visc__getNodeInstanceID_x(parentNode);
+  int gy = __visc__getNodeInstanceID_y(parentNode);
+  int gridx = __visc__getNumNodeInstances_x(thisNode);
+  int gridy = __visc__getNumNodeInstances_y(thisNode);
+  int m = gx * gridx + lx;
+  int n = gy * gridy + ly;
+
+  float c = 0.0f;
+  for (int i = 0; i < k; ++i) {
+    float a = A[m + i * lda];
+    float b = B[n + i * ldb];
+    c += a * b;
+  }
+  C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c;
 }
 
-void basicSgemmLvl1(
-    float* A, size_t bytes_A, int lda, float* B, size_t bytes_B, int ldb, float* C, size_t bytes_C, int ldc,
-    int k, float alpha, float beta, size_t dim_X1, size_t dim_Y1
-) {
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(3, A, B, C, 1, C);
-    void* sgemm_node = __visc__createNodeND(2, mysgemmNT, (size_t) dim_X1, (size_t) dim_Y1);
-    __visc__bindIn(sgemm_node, 0, 0, 0);
-    __visc__bindIn(sgemm_node, 1, 1, 0);
-    __visc__bindIn(sgemm_node, 2, 2, 0);
-    __visc__bindIn(sgemm_node, 3, 3, 0);
-    __visc__bindIn(sgemm_node, 4, 4, 0);
-    __visc__bindIn(sgemm_node, 5, 5, 0);
-    __visc__bindIn(sgemm_node, 6, 6, 0);
-    __visc__bindIn(sgemm_node, 7, 7, 0);
-    __visc__bindIn(sgemm_node, 8, 8, 0);
-    __visc__bindIn(sgemm_node, 9, 9, 0);
-    __visc__bindIn(sgemm_node, 10, 10, 0);
-    __visc__bindIn(sgemm_node, 11, 11, 0);
+void basicSgemmLvl1(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
+                    int ldb, float *C, size_t bytes_C, int ldc, int k,
+                    float alpha, float beta, size_t dim_X1, size_t dim_Y1) {
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(3, A, B, C, 1, C);
+  void *sgemm_node =
+      __visc__createNodeND(2, mysgemmNT, (size_t)dim_X1, (size_t)dim_Y1);
+  __visc__bindIn(sgemm_node, 0, 0, 0);
+  __visc__bindIn(sgemm_node, 1, 1, 0);
+  __visc__bindIn(sgemm_node, 2, 2, 0);
+  __visc__bindIn(sgemm_node, 3, 3, 0);
+  __visc__bindIn(sgemm_node, 4, 4, 0);
+  __visc__bindIn(sgemm_node, 5, 5, 0);
+  __visc__bindIn(sgemm_node, 6, 6, 0);
+  __visc__bindIn(sgemm_node, 7, 7, 0);
+  __visc__bindIn(sgemm_node, 8, 8, 0);
+  __visc__bindIn(sgemm_node, 9, 9, 0);
+  __visc__bindIn(sgemm_node, 10, 10, 0);
+  __visc__bindIn(sgemm_node, 11, 11, 0);
 }
 
-void basicSgemmLvl2(
-    float* A, size_t bytes_A, int lda, float* B, size_t bytes_B, int ldb, float* C, size_t bytes_C, int ldc,
-    int k, float alpha, float beta,
-    size_t dim_X1, size_t dim_Y1, size_t dim_X2, size_t dim_Y2
-) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(3, A, B, C, 1, C);
-    void* sgemm_node = __visc__createNodeND(2, basicSgemmLvl1, (size_t) dim_X2, (size_t) dim_Y2);
-    __visc__bindIn(sgemm_node, 0, 0, 0);
-    __visc__bindIn(sgemm_node, 1, 1, 0);
-    __visc__bindIn(sgemm_node, 2, 2, 0);
-    __visc__bindIn(sgemm_node, 3, 3, 0);
-    __visc__bindIn(sgemm_node, 4, 4, 0);
-    __visc__bindIn(sgemm_node, 5, 5, 0);
-    __visc__bindIn(sgemm_node, 6, 6, 0);
-    __visc__bindIn(sgemm_node, 7, 7, 0);
-    __visc__bindIn(sgemm_node, 8, 8, 0);
-    __visc__bindIn(sgemm_node, 9, 9, 0);
-    __visc__bindIn(sgemm_node, 10, 10, 0);
-    __visc__bindIn(sgemm_node, 11, 11, 0);
-    __visc__bindIn(sgemm_node, 12, 12, 0);
-    __visc__bindIn(sgemm_node, 13, 13, 0);
+void basicSgemmLvl2(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
+                    int ldb, float *C, size_t bytes_C, int ldc, int k,
+                    float alpha, float beta, size_t dim_X1, size_t dim_Y1,
+                    size_t dim_X2, size_t dim_Y2) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, A, B, C, 1, C);
+  void *sgemm_node =
+      __visc__createNodeND(2, basicSgemmLvl1, (size_t)dim_X2, (size_t)dim_Y2);
+  __visc__bindIn(sgemm_node, 0, 0, 0);
+  __visc__bindIn(sgemm_node, 1, 1, 0);
+  __visc__bindIn(sgemm_node, 2, 2, 0);
+  __visc__bindIn(sgemm_node, 3, 3, 0);
+  __visc__bindIn(sgemm_node, 4, 4, 0);
+  __visc__bindIn(sgemm_node, 5, 5, 0);
+  __visc__bindIn(sgemm_node, 6, 6, 0);
+  __visc__bindIn(sgemm_node, 7, 7, 0);
+  __visc__bindIn(sgemm_node, 8, 8, 0);
+  __visc__bindIn(sgemm_node, 9, 9, 0);
+  __visc__bindIn(sgemm_node, 10, 10, 0);
+  __visc__bindIn(sgemm_node, 11, 11, 0);
+  __visc__bindIn(sgemm_node, 12, 12, 0);
+  __visc__bindIn(sgemm_node, 13, 13, 0);
 }
 
 // A wrapper level used in codegen for some backends
-void basicSgemmLvl3(
-    float* A, size_t bytes_A, int lda, float* B, size_t bytes_B, int ldb, float* C, size_t bytes_C, int ldc,
-    int k, float alpha, float beta,
-    size_t dim_X1, size_t dim_Y1, size_t dim_X2, size_t dim_Y2
-) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(3, A, B, C, 1, C);
-    void* sgemm_node = __visc__createNodeND(0, basicSgemmLvl2);
-    __visc__bindIn(sgemm_node, 0, 0, 0);
-    __visc__bindIn(sgemm_node, 1, 1, 0);
-    __visc__bindIn(sgemm_node, 2, 2, 0);
-    __visc__bindIn(sgemm_node, 3, 3, 0);
-    __visc__bindIn(sgemm_node, 4, 4, 0);
-    __visc__bindIn(sgemm_node, 5, 5, 0);
-    __visc__bindIn(sgemm_node, 6, 6, 0);
-    __visc__bindIn(sgemm_node, 7, 7, 0);
-    __visc__bindIn(sgemm_node, 8, 8, 0);
-    __visc__bindIn(sgemm_node, 9, 9, 0);
-    __visc__bindIn(sgemm_node, 10, 10, 0);
-    __visc__bindIn(sgemm_node, 11, 11, 0);
-    __visc__bindIn(sgemm_node, 12, 12, 0);
-    __visc__bindIn(sgemm_node, 13, 13, 0);
-    __visc__bindIn(sgemm_node, 14, 14, 0);
-    __visc__bindIn(sgemm_node, 15, 15, 0);
+void basicSgemmLvl3(float *A, size_t bytes_A, int lda, float *B, size_t bytes_B,
+                    int ldb, float *C, size_t bytes_C, int ldc, int k,
+                    float alpha, float beta, size_t dim_X1, size_t dim_Y1,
+                    size_t dim_X2, size_t dim_Y2) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, A, B, C, 1, C);
+  void *sgemm_node = __visc__createNodeND(0, basicSgemmLvl2);
+  __visc__bindIn(sgemm_node, 0, 0, 0);
+  __visc__bindIn(sgemm_node, 1, 1, 0);
+  __visc__bindIn(sgemm_node, 2, 2, 0);
+  __visc__bindIn(sgemm_node, 3, 3, 0);
+  __visc__bindIn(sgemm_node, 4, 4, 0);
+  __visc__bindIn(sgemm_node, 5, 5, 0);
+  __visc__bindIn(sgemm_node, 6, 6, 0);
+  __visc__bindIn(sgemm_node, 7, 7, 0);
+  __visc__bindIn(sgemm_node, 8, 8, 0);
+  __visc__bindIn(sgemm_node, 9, 9, 0);
+  __visc__bindIn(sgemm_node, 10, 10, 0);
+  __visc__bindIn(sgemm_node, 11, 11, 0);
+  __visc__bindIn(sgemm_node, 12, 12, 0);
+  __visc__bindIn(sgemm_node, 13, 13, 0);
+  __visc__bindIn(sgemm_node, 14, 14, 0);
+  __visc__bindIn(sgemm_node, 15, 15, 0);
 }
 
-__attribute__((noinline)) void basicSgemm(
-    char transa, char transb, int m, int n, int k, float alpha,
-    float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta,
-    float* C, size_t bytesC, int ldc
-) {
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_SZ) || (n%TILE_SZ)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
-                  << "; n should be multiple of " << TILE_SZ << std::endl;
-    }
-
-    size_t db[2] = {TILE_SZ,TILE_SZ}, dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
-
-    void *root_in = malloc(sizeof(RootIn));
-    RootIn root_in_local = {
-        A, bytesA, lda,
-        B, bytesB, ldb,
-        C, bytesC, ldc,
-        k, alpha, beta,
-        db[0], db[1], dg[0]/db[0], dg[1]/db[1]
-    };
-    *(RootIn *)root_in = root_in_local;
-    void* sgemmDFG = __visc__launch(0, basicSgemmLvl3, root_in);
-    __visc__wait(sgemmDFG);
-}
+__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
 
-int main (int argc, char *argv[]) {
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
 
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_SZ) || (n % TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_SZ << "; n should be multiple of " << TILE_SZ
+              << std::endl;
+  }
+
+  size_t db[2] = {TILE_SZ, TILE_SZ},
+         dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
+
+  void *root_in = malloc(sizeof(RootIn));
+  RootIn root_in_local = {A,
+                          bytesA,
+                          lda,
+                          B,
+                          bytesB,
+                          ldb,
+                          C,
+                          bytesC,
+                          ldc,
+                          k,
+                          alpha,
+                          beta,
+                          db[0],
+                          db[1],
+                          dg[0] / db[0],
+                          dg[1] / db[1]};
+  *(RootIn *)root_in = root_in_local;
+  void *sgemmDFG = __visc__launch(0, basicSgemmLvl3, root_in);
+  __visc__wait(sgemmDFG);
+}
 
-    size_t A_sz, B_sz, C_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
+int main(int argc, char *argv[]) {
 
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
 
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
 
-    unsigned iter = 0;
-    while(params->inpFiles[iter] != NULL) {
-        printf("Found input file %d - %s\n", iter, params->inpFiles[iter]);
-        iter++;
-    }
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        printf("Expecting three input filenames\n");
-        exit(-1);
-        return 0;
-    }
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
 
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
+  unsigned iter = 0;
+  while (params->inpFiles[iter] != NULL) {
+    printf("Found input file %d - %s\n", iter, params->inpFiles[iter]);
+    iter++;
+  }
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    printf("Expecting three input filenames\n");
+    exit(-1);
+    return 0;
+  }
 
-    printf("This is in between two reads\n");
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
+  /* Read in data */
+  // load A
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
+  printf("This is in between two reads\n");
+  // load B^T
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
 
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // copy A to device memory
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
+  // allocate space for C
+  C_sz = matArow * matBcol * sizeof(float);
 
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  // OpenCL memory allocation
+  std::vector<float> matC(matArow * matBcol);
 
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
+  llvm_visc_track_mem(&matA.front(), A_sz);
+  llvm_visc_track_mem(&matBT.front(), B_sz);
+  llvm_visc_track_mem(&matC.front(), C_sz);
+  // Copy A and B^T into device memory
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_NONE );
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-    // Use standard sgemm interface
-    basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
+             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
+             matArow);
 
-    pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_visc_request_mem(&matC.front(), C_sz);
 
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  llvm_visc_untrack_mem(&matA.front());
+  llvm_visc_untrack_mem(&matBT.front());
+  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
+  pb_PrintTimerSet(&timers);
+  __visc__cleanup();
 
-    if (params->outFile) {
+  if (params->outFile) {
 
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
+    /* Write C to file */
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
 
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_FreeParameters(params);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc
index 96ffcaddddbcddcd3b75903a23dbfc6c944a8cbf..62f9285e8a8054e5597fe45adc5257470b147622 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_opt/main.cc
@@ -10,178 +10,177 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <parboil.h>
 #include <visc.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_N 16
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta )
-{
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(3, A, B, C, 1, C);
+void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
+               float alpha, float beta) {
+  __visc__hint(visc::GPU_TARGET);
+  __visc__attributes(3, A, B, C, 1, C);
 
-    float c[TILE_N];
-    for (int i=0; i < TILE_N; i++)
-	c[i] = 0.0f;
-   
-    int mid = get_local_id(1)*get_local_size(0)+get_local_id(0);
-    int m = get_group_id(0) * TILE_M + mid;
+  float c[TILE_N];
+  for (int i = 0; i < TILE_N; i++)
+    c[i] = 0.0f;
 
-    int b_base = 0;
+  int mid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+  int m = get_group_id(0) * TILE_M + mid;
 
-    for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
-	float a; 
-        b_base = get_group_id(1) * TILE_N + i * ldb;
+  int b_base = 0;
 
-	for (int j = 0; j < TILE_TB_HEIGHT; j++) {
-	    a = A[m + (i+j)*lda];
-	    for (int kk = 0; kk < TILE_N; kk++)
-		c[kk] += a * B[b_base + j * ldb + kk];
+  for (int i = 0; i < k; i += TILE_TB_HEIGHT) {
+    float a;
+    b_base = get_group_id(1) * TILE_N + i * ldb;
 
-	}
-    }
-    int t = ldc * get_group_id(1) * TILE_N + m;
-    for (int i = 0; i < TILE_N; i++) {
-	C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i];
+    for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+      a = A[m + (i + j) * lda];
+      for (int kk = 0; kk < TILE_N; kk++)
+        c[kk] += a * B[b_base + j * ldb + kk];
     }
+  }
+  int t = ldc * get_group_id(1) * TILE_N + m;
+  for (int i = 0; i < TILE_N; i++) {
+    C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i];
+  }
 }
 
-__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
+__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
 
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
 
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_M) || (n%TILE_N)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-              << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
+    return;
+  }
 
-    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-    unsigned dg[2] = {m*db[0]/TILE_M,n*db[1]/TILE_N};
+  unsigned db[2] = {TILE_N, TILE_TB_HEIGHT};
+  //    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
+  unsigned dg[2] = {m * db[0] / TILE_M, n * db[1] / TILE_N};
 
-    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-    __visc__wait(sgemmDFG);
+  unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
+                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
+  __visc__wait(sgemmDFG);
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
 
-    size_t A_sz, B_sz, C_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
 
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
-
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
-
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
+  /* Read in data */
+  // load A
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
+  // load B^T
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
 
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // copy A to device memory
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
+  // allocate space for C
+  C_sz = matArow * matBcol * sizeof(float);
 
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  // OpenCL memory allocation
+  std::vector<float> matC(matArow * matBcol);
 
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
+  llvm_visc_track_mem(&matA.front(), A_sz);
+  llvm_visc_track_mem(&matBT.front(), B_sz);
+  llvm_visc_track_mem(&matC.front(), C_sz);
+  // Copy A and B^T into device memory
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_NONE );
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-    // Use standard sgemm interface
-    basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
+             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
+             matArow);
 
-    pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_visc_request_mem(&matC.front(), C_sz);
 
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  llvm_visc_untrack_mem(&matA.front());
+  llvm_visc_untrack_mem(&matBT.front());
+  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
+  pb_PrintTimerSet(&timers);
+  __visc__cleanup();
 
-    if (params->outFile) {
+  if (params->outFile) {
 
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
+    /* Write C to file */
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
 
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_FreeParameters(params);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc
index 16f2341a2203e3510b9c00a91eedd3ac53d296d4..05d143b5884164926213ca060da341a254399bf3 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_sh/main.cc
@@ -10,377 +10,341 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <parboil.h>
 #include <visc.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_N 16
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
+
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
 typedef struct __attribute__((__packed__)) {
-    float *A;
-    size_t bytesA;
-    int lda;
-    float *B;
-    size_t bytesB;
-    int ldb;
-    float *C;
-    size_t bytesC;
-    int ldc;
-    int k;
-    float alpha;
-    float beta;
-    long block_x;
-    long block_y;
-    long grid_x;
-    long grid_y;
-}
-RootIn;
-
-void packData(RootIn* args,
-              float *A, size_t bytesA,
-              int lda,
-              float *B, size_t bytesB,
-              int ldb,
-              float *C, size_t bytesC,
-              int ldc,
-              int k,
-              float alpha,
-              float beta,
-              long block_x,
-              long block_y,
-              long grid_x,
+  float *A;
+  size_t bytesA;
+  int lda;
+  float *B;
+  size_t bytesB;
+  int ldb;
+  float *C;
+  size_t bytesC;
+  int ldc;
+  int k;
+  float alpha;
+  float beta;
+  long block_x;
+  long block_y;
+  long grid_x;
+  long grid_y;
+} RootIn;
+
+void packData(RootIn *args, float *A, size_t bytesA, int lda, float *B,
+              size_t bytesB, int ldb, float *C, size_t bytesC, int ldc, int k,
+              float alpha, float beta, long block_x, long block_y, long grid_x,
               long grid_y) {
-    args->A = A;
-    args->bytesA = bytesA;
-    args->lda = lda;
-    args->B = B;
-    args->bytesB = bytesB;
-    args->ldb = ldb;
-    args->C = C;
-    args->bytesC = bytesC;
-    args->ldc = ldc;
-    args->k = k;
-    args->alpha = alpha;
-    args->beta = beta;
-    args->block_x = block_x;
-    args->block_y = block_y;
-    args->grid_x = grid_x;
-    args->grid_y = grid_y;
+  args->A = A;
+  args->bytesA = bytesA;
+  args->lda = lda;
+  args->B = B;
+  args->bytesB = bytesB;
+  args->ldb = ldb;
+  args->C = C;
+  args->bytesC = bytesC;
+  args->ldc = ldc;
+  args->k = k;
+  args->alpha = alpha;
+  args->beta = beta;
+  args->block_x = block_x;
+  args->block_y = block_y;
+  args->grid_x = grid_x;
+  args->grid_y = grid_y;
 }
 
 void Allocation(long block_x, long block_y) {
-    void* shB = __visc__malloc(block_x*block_y*sizeof(float));
-    __visc__return(2, shB, block_x*block_y*sizeof(float));
+  void *shB = __visc__malloc(block_x * block_y * sizeof(float));
+  __visc__return(2, shB, block_x * block_y * sizeof(float));
 }
 
+void SgemmLeaf(float *A, size_t bytesA, int lda, float *B, size_t bytesB,
+               int ldb, float *C, size_t bytesC, int ldc, int k, float alpha,
+               float beta, float *shB, size_t bytesshB) {
+  __visc__hint(visc::DEVICE);
+  //__visc__hint(visc::SPIR_TARGET);
+  //__visc__hint(visc::GPU_TARGET);
 
-void SgemmLeaf( float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float* C, size_t bytesC, int ldc, int k, float alpha, float beta, float* shB, size_t bytesshB) {
-    __visc__hint(visc::DEVICE);
-    //__visc__hint(visc::SPIR_TARGET);
-    //__visc__hint(visc::GPU_TARGET);
+  __visc__attributes(3, A, B, C, 1, C);
 
-    __visc__attributes(3, A, B, C, 1, C);
+  void *thisNode = __visc__getNode();
+  void *parentNode = __visc__getParentNode(thisNode);
 
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
+  long lx = __visc__getNodeInstanceID_x(thisNode);
+  long ly = __visc__getNodeInstanceID_y(thisNode);
 
-    long lx = __visc__getNodeInstanceID_x(thisNode);
-    long ly = __visc__getNodeInstanceID_y(thisNode);
+  long gx = __visc__getNodeInstanceID_x(parentNode);
+  long gy = __visc__getNodeInstanceID_y(parentNode);
 
-    long gx = __visc__getNodeInstanceID_x(parentNode);
-    long gy = __visc__getNodeInstanceID_y(parentNode);
+  long dimx = __visc__getNumNodeInstances_x(thisNode);
 
-    long dimx = __visc__getNumNodeInstances_x(thisNode);
+  float c[TILE_N];
+  for (int i = 0; i < TILE_N; i++)
+    c[i] = 0.0f;
 
-    float c[TILE_N];
-    for (int i=0; i < TILE_N; i++)
-        c[i] = 0.0f;
+  int mid = ly * dimx + lx;
+  int m = gx * TILE_M + mid;
+  int n = gy * TILE_N + lx;
 
-    int mid = ly*dimx+lx;
-    int m = gx * TILE_M + mid;
-    int n = gy * TILE_N + lx;
+  for (int i = 0; i < k; i += TILE_TB_HEIGHT) {
+    float a;
+    // shB[ly][lx] = B[n+(i+ly)*ldb];
+    shB[ly * dimx + lx] = B[n + (i + ly) * ldb];
 
-    for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
-        float a;
-        //shB[ly][lx] = B[n+(i+ly)*ldb];
-        shB[ly*dimx+lx] = B[n+(i+ly)*ldb];
-
-        __visc__barrier();
-        for (int j = 0; j < TILE_TB_HEIGHT; j++) {
-            a = A[m + (i+j)*lda];
-            for (int kk = 0; kk < TILE_N; kk++) {
-                //c[kk] += a * shB[j][kk];
-                c[kk] += a * shB[j*dimx+kk];
-            }
-        }
-        __visc__barrier();
+    __visc__barrier();
+    for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+      a = A[m + (i + j) * lda];
+      for (int kk = 0; kk < TILE_N; kk++) {
+        // c[kk] += a * shB[j][kk];
+        c[kk] += a * shB[j * dimx + kk];
+      }
     }
+    __visc__barrier();
+  }
 
-    int t = ldc * gy * TILE_N + m;
-    for (int i = 0; i < TILE_N; i++) {
-        C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i];
-    }
+  int t = ldc * gy * TILE_N + m;
+  for (int i = 0; i < TILE_N; i++) {
+    C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i];
+  }
 }
 
 // Work group node for sgemm - Creates allocation node and leaf (work item) node
-void SgemmTB(float *A, size_t bytesA,
-             int lda,
-             float *B, size_t bytesB,
-             int ldb,
-             float *C, size_t bytesC,
-             int ldc,
-             int k,
-             float alpha,
-             float beta,
-             long block_x,
-             long block_y) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(3, A, B, C, 1, C);
-    void* AllocationNode = __visc__createNodeND(0, Allocation);
-    void* SgemmLeafNode = __visc__createNodeND(2, SgemmLeaf, block_x, block_y);
-
-    // Bind edges
-    __visc__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A
-    __visc__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda
-    __visc__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B
-    __visc__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB
-    __visc__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb
-    __visc__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C
-    __visc__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC
-    __visc__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc
-    __visc__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k
-    __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha
-    __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta
-
-    __visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x
-    __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y
-
-
-    // Create Edges between AllocationNode and BFSLeafNodeNode
-    __visc__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B
-    __visc__edge(AllocationNode, SgemmLeafNode, 1, 1, 13, 0); // Edge bytes_local_B
-
+void SgemmTB(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb,
+             float *C, size_t bytesC, int ldc, int k, float alpha, float beta,
+             long block_x, long block_y) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, A, B, C, 1, C);
+  void *AllocationNode = __visc__createNodeND(0, Allocation);
+  void *SgemmLeafNode = __visc__createNodeND(2, SgemmLeaf, block_x, block_y);
+
+  // Bind edges
+  __visc__bindIn(SgemmLeafNode, 0, 0, 0);   // Bind A
+  __visc__bindIn(SgemmLeafNode, 1, 1, 0);   // Bind bytesA
+  __visc__bindIn(SgemmLeafNode, 2, 2, 0);   // Bind lda
+  __visc__bindIn(SgemmLeafNode, 3, 3, 0);   // Bind B
+  __visc__bindIn(SgemmLeafNode, 4, 4, 0);   // Bind bytesB
+  __visc__bindIn(SgemmLeafNode, 5, 5, 0);   // Bind ldb
+  __visc__bindIn(SgemmLeafNode, 6, 6, 0);   // Bind C
+  __visc__bindIn(SgemmLeafNode, 7, 7, 0);   // Bind bytesC
+  __visc__bindIn(SgemmLeafNode, 8, 8, 0);   // Bind ldc
+  __visc__bindIn(SgemmLeafNode, 9, 9, 0);   // Bind k
+  __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha
+  __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta
+
+  __visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x
+  __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y
+
+  // Create Edges between AllocationNode and BFSLeafNodeNode
+  __visc__edge(AllocationNode, SgemmLeafNode, 1, 0, 12, 0); // Edge local_B
+  __visc__edge(AllocationNode, SgemmLeafNode, 1, 1, 13,
+               0); // Edge bytes_local_B
 }
 
 // Root node for sgemm - Creates work group node
-void SgemmRoot(
-    float *A, size_t bytesA, int lda, // 0-2
-    float *B, size_t bytesB, int ldb, // 3-5
-    float *C, size_t bytesC, int ldc, // 6-8
-    int k, float alpha, float beta, // 9-11
-    long block_x, long block_y, long grid_x, long grid_y // 12-15
+void SgemmRoot(float *A, size_t bytesA, int lda,                    // 0-2
+               float *B, size_t bytesB, int ldb,                    // 3-5
+               float *C, size_t bytesC, int ldc,                    // 6-8
+               int k, float alpha, float beta,                      // 9-11
+               long block_x, long block_y, long grid_x, long grid_y // 12-15
 ) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(3, A, B, C, 1, C);
-    void* SgemmTBNode = __visc__createNodeND(2, SgemmTB, grid_x, grid_y);
-
-    // Bind edges
-    __visc__bindIn(SgemmTBNode, 0, 0, 0); // Bind A
-    __visc__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda
-    __visc__bindIn(SgemmTBNode, 3, 3, 0); // Bind B
-    __visc__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB
-    __visc__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb
-    __visc__bindIn(SgemmTBNode, 6, 6, 0); // Bind C
-    __visc__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC
-    __visc__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc
-    __visc__bindIn(SgemmTBNode, 9, 9, 0); // Bind k
-    __visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha
-    __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta
-    __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x
-    __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y
-
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, A, B, C, 1, C);
+  void *SgemmTBNode = __visc__createNodeND(2, SgemmTB, grid_x, grid_y);
+
+  // Bind edges
+  __visc__bindIn(SgemmTBNode, 0, 0, 0);   // Bind A
+  __visc__bindIn(SgemmTBNode, 1, 1, 0);   // Bind bytesA
+  __visc__bindIn(SgemmTBNode, 2, 2, 0);   // Bind lda
+  __visc__bindIn(SgemmTBNode, 3, 3, 0);   // Bind B
+  __visc__bindIn(SgemmTBNode, 4, 4, 0);   // Bind bytesB
+  __visc__bindIn(SgemmTBNode, 5, 5, 0);   // Bind ldb
+  __visc__bindIn(SgemmTBNode, 6, 6, 0);   // Bind C
+  __visc__bindIn(SgemmTBNode, 7, 7, 0);   // Bind bytesC
+  __visc__bindIn(SgemmTBNode, 8, 8, 0);   // Bind ldc
+  __visc__bindIn(SgemmTBNode, 9, 9, 0);   // Bind k
+  __visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha
+  __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta
+  __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x
+  __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y
 }
 
-void SgemmWrapper(
-    float *A, size_t bytesA, int lda, // 0-2
-    float *B, size_t bytesB, int ldb, // 3-5
-    float *C, size_t bytesC, int ldc, // 6-8
-    int k, float alpha, float beta, // 9-11
-    long block_x, long block_y, long grid_x, long grid_y // 12-15
+void SgemmWrapper(float *A, size_t bytesA, int lda,                    // 0-2
+                  float *B, size_t bytesB, int ldb,                    // 3-5
+                  float *C, size_t bytesC, int ldc,                    // 6-8
+                  int k, float alpha, float beta,                      // 9-11
+                  long block_x, long block_y, long grid_x, long grid_y // 12-15
 ) {
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(3, A, B, C, 1, C);
-    void* SgemmRootNode = __visc__createNodeND(0, SgemmRoot);
-
-    // Bind edges
-    __visc__bindIn(SgemmRootNode, 0, 0, 0); // Bind A
-    __visc__bindIn(SgemmRootNode, 1, 1, 0); // Bind bytesA
-    __visc__bindIn(SgemmRootNode, 2, 2, 0); // Bind lda
-    __visc__bindIn(SgemmRootNode, 3, 3, 0); // Bind B
-    __visc__bindIn(SgemmRootNode, 4, 4, 0); // Bind bytesB
-    __visc__bindIn(SgemmRootNode, 5, 5, 0); // Bind ldb
-    __visc__bindIn(SgemmRootNode, 6, 6, 0); // Bind C
-    __visc__bindIn(SgemmRootNode, 7, 7, 0); // Bind bytesC
-    __visc__bindIn(SgemmRootNode, 8, 8, 0); // Bind ldc
-    __visc__bindIn(SgemmRootNode, 9, 9, 0); // Bind k
-    __visc__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha
-    __visc__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta
-    __visc__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x
-    __visc__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y
-    __visc__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x
-    __visc__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(3, A, B, C, 1, C);
+  void *SgemmRootNode = __visc__createNodeND(0, SgemmRoot);
+
+  // Bind edges
+  __visc__bindIn(SgemmRootNode, 0, 0, 0);   // Bind A
+  __visc__bindIn(SgemmRootNode, 1, 1, 0);   // Bind bytesA
+  __visc__bindIn(SgemmRootNode, 2, 2, 0);   // Bind lda
+  __visc__bindIn(SgemmRootNode, 3, 3, 0);   // Bind B
+  __visc__bindIn(SgemmRootNode, 4, 4, 0);   // Bind bytesB
+  __visc__bindIn(SgemmRootNode, 5, 5, 0);   // Bind ldb
+  __visc__bindIn(SgemmRootNode, 6, 6, 0);   // Bind C
+  __visc__bindIn(SgemmRootNode, 7, 7, 0);   // Bind bytesC
+  __visc__bindIn(SgemmRootNode, 8, 8, 0);   // Bind ldc
+  __visc__bindIn(SgemmRootNode, 9, 9, 0);   // Bind k
+  __visc__bindIn(SgemmRootNode, 10, 10, 0); // Bind alpha
+  __visc__bindIn(SgemmRootNode, 11, 11, 0); // Bind beta
+  __visc__bindIn(SgemmRootNode, 12, 12, 0); // Bind block_x
+  __visc__bindIn(SgemmRootNode, 13, 13, 0); // Bind block_y
+  __visc__bindIn(SgemmRootNode, 14, 14, 0); // Bind grid_x
+  __visc__bindIn(SgemmRootNode, 15, 15, 0); // Bind grid_y
 }
 
 // Creates root node for sgemm
-__attribute__((noinline)) void basicSgemm(struct pb_TimerSet* timers, char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
+__attribute__((noinline)) void basicSgemm(struct pb_TimerSet *timers,
+                                          char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
 
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
 
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_M) || (n%TILE_N)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-                  << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
+    return;
+  }
+
+  //    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
+  //    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
+
+  long block_x = TILE_N;
+  long block_y = TILE_TB_HEIGHT;
+  long grid_x = m / TILE_M;
+  long grid_y = n / TILE_N;
 
-//    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-//    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
-
-    long block_x = TILE_N;
-    long block_y = TILE_TB_HEIGHT;
-    long grid_x = m/TILE_M;
-    long grid_y = n/TILE_N;
-
-    // Pack data in struct
-    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-    packData(args,
-             A, bytesA,
-             lda,
-             B, bytesB,
-             ldb,
-             C, bytesC,
-             ldc,
-             k,
-             alpha,
-             beta,
-             block_x,
-             block_y,
-             grid_x,
-             grid_y
-            );
-
-    pb_SwitchToTimer( timers, visc_TimerID_COMPUTATION );
-    void* sgemmDFG = __visc__launch(0, SgemmWrapper, (void*) args);
-
-    __visc__wait(sgemmDFG);
-    pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
+  // Pack data in struct
+  RootIn *args = (RootIn *)malloc(sizeof(RootIn));
+  packData(args, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta,
+           block_x, block_y, grid_x, grid_y);
+
+  pb_SwitchToTimer(timers, visc_TimerID_COMPUTATION);
+  void *sgemmDFG = __visc__launch(0, SgemmWrapper, (void *)args);
+
+  __visc__wait(sgemmDFG);
+  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
 }
 
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t A_sz, B_sz, C_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
-
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
+int main(int argc, char *argv[]) {
 
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
 
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
+  /* Read in data */
+  // load A
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
 
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
+  // load B^T
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
 
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // copy A to device memory
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
+  // allocate space for C
+  C_sz = matArow * matBcol * sizeof(float);
 
+  // OpenCL memory allocation
+  std::vector<float> matC(matArow * matBcol);
 
-    // Use standard sgemm interface
-    basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol,
-               0.0f, &matC.front(), C_sz, matArow);
+  llvm_visc_track_mem(&matA.front(), A_sz);
+  llvm_visc_track_mem(&matBT.front(), B_sz);
+  llvm_visc_track_mem(&matC.front(), C_sz);
+  // Copy A and B^T into device memory
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  // Use standard sgemm interface
+  basicSgemm(&timers, 'N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(),
+             A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(),
+             C_sz, matArow);
 
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_visc_request_mem(&matC.front(), C_sz);
 
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  llvm_visc_untrack_mem(&matA.front());
+  llvm_visc_untrack_mem(&matBT.front());
+  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    if (params->outFile) {
-      /* Write C to file */
-      //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
-    }
+  pb_PrintTimerSet(&timers);
+  __visc__cleanup();
+
+  if (params->outFile) {
+    /* Write C to file */
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
 
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_FreeParameters(params);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc
index 71a615026f979a70ffb7d99341e3e5a1ba23e8b2..0dfcdfb835e73fb2a0c7db9d1f24e67b11375ad8 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc/main.cc
@@ -10,168 +10,171 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <parboil.h>
 #include <visc.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_SZ 16
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta )
-{
-    __visc__attributes(3, A, B, C, 1, C);
-    float c0, c1, c2, c3;
-    c0 = c1 = c2 = c3 = 0.0f;
-    int m = 4 * get_global_id(0);
-    int n = get_global_id(1);
-
-    for (int i = 0; i < k; ++i) {
-        float a0 = A[m + i * lda]; 
-        float a1 = A[m + 1 + i * lda]; 
-        float a2 = A[m + 2 + i * lda]; 
-        float a3 = A[m + 3 + i * lda]; 
-
-        float b = B[n + i * ldb];
-
-        c0 += a0 * b;
-        c1 += a1 * b;
-        c2 += a2 * b;
-        c3 += a3 * b;
-    }
-    C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c0;
-    C[m+1+n*ldc] = C[m+1+n*ldc] * beta + alpha * c1;
-    C[m+2+n*ldc] = C[m+2+n*ldc] * beta + alpha * c2;
-    C[m+3+n*ldc] = C[m+3+n*ldc] * beta + alpha * c3;
+void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
+               float alpha, float beta) {
+  __visc__attributes(3, A, B, C, 1, C);
+  float c0, c1, c2, c3;
+  c0 = c1 = c2 = c3 = 0.0f;
+  int m = 4 * get_global_id(0);
+  int n = get_global_id(1);
+
+  for (int i = 0; i < k; ++i) {
+    float a0 = A[m + i * lda];
+    float a1 = A[m + 1 + i * lda];
+    float a2 = A[m + 2 + i * lda];
+    float a3 = A[m + 3 + i * lda];
+
+    float b = B[n + i * ldb];
+
+    c0 += a0 * b;
+    c1 += a1 * b;
+    c2 += a2 * b;
+    c3 += a3 * b;
+  }
+  C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c0;
+  C[m + 1 + n * ldc] = C[m + 1 + n * ldc] * beta + alpha * c1;
+  C[m + 2 + n * ldc] = C[m + 2 + n * ldc] * beta + alpha * c2;
+  C[m + 3 + n * ldc] = C[m + 3 + n * ldc] * beta + alpha * c3;
 }
 
-__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_SZ) || (n%TILE_SZ)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
-                  << "; n should be multiple of " << TILE_SZ << std::endl;
-    }
-
-    unsigned db[2] = {TILE_SZ/4,TILE_SZ};
-    unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
-
-    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-    __visc__wait(sgemmDFG);
+__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
+
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
+
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_SZ) || (n % TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_SZ << "; n should be multiple of " << TILE_SZ
+              << std::endl;
+  }
+
+  unsigned db[2] = {TILE_SZ / 4, TILE_SZ};
+  unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
+
+  unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
+                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
+  __visc__wait(sgemmDFG);
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
 
-    size_t A_sz, B_sz, C_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
 
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
-    /* Read in data */
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  /* Read in data */
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
 
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
+  // load A
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
 
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
+  // load B^T
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // copy A to device memory
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
+  // allocate space for C
+  C_sz = matArow * matBcol * sizeof(float);
 
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
+  // OpenCL memory allocation
+  std::vector<float> matC(matArow * matBcol);
 
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  llvm_visc_track_mem(&matA.front(), A_sz);
+  llvm_visc_track_mem(&matBT.front(), B_sz);
+  llvm_visc_track_mem(&matC.front(), C_sz);
+  // Copy A and B^T into device memory
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-    pb_SwitchToTimer( &timers, pb_TimerID_NONE );
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    // Use standard sgemm interface
-    basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow);
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
+             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
+             matArow);
 
-    if (params->outFile) {
-        pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  if (params->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
-        /* Write C to file */
-        llvm_visc_request_mem(&matC.front(), C_sz);
-        pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
+    /* Write C to file */
+    llvm_visc_request_mem(&matC.front(), C_sz);
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
 
-    pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  llvm_visc_untrack_mem(&matA.front());
+  llvm_visc_untrack_mem(&matBT.front());
+  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-    pb_FreeParameters(params);
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_PrintTimerSet(&timers);
+  __visc__cleanup();
+  pb_FreeParameters(params);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc
index 71a615026f979a70ffb7d99341e3e5a1ba23e8b2..0dfcdfb835e73fb2a0c7db9d1f24e67b11375ad8 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_tc_vec/main.cc
@@ -10,168 +10,171 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <parboil.h>
 #include <visc.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_SZ 16
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta )
-{
-    __visc__attributes(3, A, B, C, 1, C);
-    float c0, c1, c2, c3;
-    c0 = c1 = c2 = c3 = 0.0f;
-    int m = 4 * get_global_id(0);
-    int n = get_global_id(1);
-
-    for (int i = 0; i < k; ++i) {
-        float a0 = A[m + i * lda]; 
-        float a1 = A[m + 1 + i * lda]; 
-        float a2 = A[m + 2 + i * lda]; 
-        float a3 = A[m + 3 + i * lda]; 
-
-        float b = B[n + i * ldb];
-
-        c0 += a0 * b;
-        c1 += a1 * b;
-        c2 += a2 * b;
-        c3 += a3 * b;
-    }
-    C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c0;
-    C[m+1+n*ldc] = C[m+1+n*ldc] * beta + alpha * c1;
-    C[m+2+n*ldc] = C[m+2+n*ldc] * beta + alpha * c2;
-    C[m+3+n*ldc] = C[m+3+n*ldc] * beta + alpha * c3;
+void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
+               float alpha, float beta) {
+  __visc__attributes(3, A, B, C, 1, C);
+  float c0, c1, c2, c3;
+  c0 = c1 = c2 = c3 = 0.0f;
+  int m = 4 * get_global_id(0);
+  int n = get_global_id(1);
+
+  for (int i = 0; i < k; ++i) {
+    float a0 = A[m + i * lda];
+    float a1 = A[m + 1 + i * lda];
+    float a2 = A[m + 2 + i * lda];
+    float a3 = A[m + 3 + i * lda];
+
+    float b = B[n + i * ldb];
+
+    c0 += a0 * b;
+    c1 += a1 * b;
+    c2 += a2 * b;
+    c3 += a3 * b;
+  }
+  C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c0;
+  C[m + 1 + n * ldc] = C[m + 1 + n * ldc] * beta + alpha * c1;
+  C[m + 2 + n * ldc] = C[m + 2 + n * ldc] * beta + alpha * c2;
+  C[m + 3 + n * ldc] = C[m + 3 + n * ldc] * beta + alpha * c3;
 }
 
-__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_SZ) || (n%TILE_SZ)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
-                  << "; n should be multiple of " << TILE_SZ << std::endl;
-    }
-
-    unsigned db[2] = {TILE_SZ/4,TILE_SZ};
-    unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
-
-    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-    __visc__wait(sgemmDFG);
+__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
+
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
+
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_SZ) || (n % TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_SZ << "; n should be multiple of " << TILE_SZ
+              << std::endl;
+  }
+
+  unsigned db[2] = {TILE_SZ / 4, TILE_SZ};
+  unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
+
+  unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
+                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
+  __visc__wait(sgemmDFG);
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
 
-    size_t A_sz, B_sz, C_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
 
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
-    /* Read in data */
-    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  /* Read in data */
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
 
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
+  // load A
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
 
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
+  // load B^T
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // copy A to device memory
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
+  // allocate space for C
+  C_sz = matArow * matBcol * sizeof(float);
 
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
+  // OpenCL memory allocation
+  std::vector<float> matC(matArow * matBcol);
 
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  llvm_visc_track_mem(&matA.front(), A_sz);
+  llvm_visc_track_mem(&matBT.front(), B_sz);
+  llvm_visc_track_mem(&matC.front(), C_sz);
+  // Copy A and B^T into device memory
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-    pb_SwitchToTimer( &timers, pb_TimerID_NONE );
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    // Use standard sgemm interface
-    basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow);
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
+             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
+             matArow);
 
-    if (params->outFile) {
-        pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  if (params->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
-        /* Write C to file */
-        llvm_visc_request_mem(&matC.front(), C_sz);
-        pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
+    /* Write C to file */
+    llvm_visc_request_mem(&matC.front(), C_sz);
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
 
-    pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  llvm_visc_untrack_mem(&matA.front());
+  llvm_visc_untrack_mem(&matBT.front());
+  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
-    pb_FreeParameters(params);
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_PrintTimerSet(&timers);
+  __visc__cleanup();
+  pb_FreeParameters(params);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc
index 82e3cadcb56c7c942c5d359ffe33c6bb133af870..76d0cefc817ea28f2ffb15cd48d8dd5c7a97d0e0 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec/main.cc
@@ -10,179 +10,180 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <parboil.h>
 #include <visc.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_SZ 16
 #define VEC_SZ 8
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
   }
 
-void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta )
-{
-    __visc__hint(visc::GPU_TARGET);
-    __visc__attributes(3, A, B, C, 1, C);
-
-    float c = 0.0f;
-    int m = get_global_id(0);
-    int n = get_global_id(1);
-
-    for (int i = 0; i < k; ++i) {
-	float a = A[m + i * lda]; 
-	float b = B[n + i * ldb];
-	c += a * b;
-    }
-    C[m+n*ldc] = C[m+n*ldc] * beta + alpha * c;
-/*
-    Will be substituted by this kernel at the llvm level
-    // Partial results 
-    float8 cp = (float8)(0.0f);
-
-    int m = get_global_id(0) * 8;
-    int n = get_global_id(1);
-
-    for (int i = 0; i < k; ++i) {
-        float8 a = vload8(0, A + (m + i * lda));
-        float8 b = (float8)(B[n + i * ldb]);
-        cp += a * b;
-    }
-
-    float8 c = vload8(0, C + (m+n*ldc));
-    c = c * beta + alpha * cp;
-    vstore8(c, 0, C + (m+n*ldc));
-*/
-}
+void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
+               float alpha, float beta) {
+  __visc__hint(visc::GPU_TARGET);
+  __visc__attributes(3, A, B, C, 1, C);
+
+  float c = 0.0f;
+  int m = get_global_id(0);
+  int n = get_global_id(1);
 
-__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
-
-    // In this code we assume the matrix sizes are multiple of tile size
-    if ((m%TILE_SZ) || (n%TILE_SZ)) {
-        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_SZ
-                  << "; n should be multiple of " << TILE_SZ << std::endl;
-    }
-
-    unsigned db[2] = {TILE_SZ/VEC_SZ,TILE_SZ};
-    unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
-
-    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-    __visc__wait(sgemmDFG);
+  for (int i = 0; i < k; ++i) {
+    float a = A[m + i * lda];
+    float b = B[n + i * ldb];
+    c += a * b;
+  }
+  C[m + n * ldc] = C[m + n * ldc] * beta + alpha * c;
+  /*
+      Will be substituted by this kernel at the llvm level
+      // Partial results
+      float8 cp = (float8)(0.0f);
+
+      int m = get_global_id(0) * 8;
+      int n = get_global_id(1);
+
+      for (int i = 0; i < k; ++i) {
+          float8 a = vload8(0, A + (m + i * lda));
+          float8 b = (float8)(B[n + i * ldb]);
+          cp += a * b;
+      }
+
+      float8 c = vload8(0, C + (m+n*ldc));
+      c = c * beta + alpha * cp;
+      vstore8(c, 0, C + (m+n*ldc));
+  */
 }
 
-int main (int argc, char *argv[]) {
+__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
+
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
+
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_SZ) || (n % TILE_SZ)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_SZ << "; n should be multiple of " << TILE_SZ
+              << std::endl;
+  }
 
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
+  unsigned db[2] = {TILE_SZ / VEC_SZ, TILE_SZ};
+  unsigned dg[2] = {m / TILE_SZ * db[0], n / TILE_SZ * db[1]};
 
-    size_t A_sz, B_sz, C_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
+  unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+                                   dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
+                                   ldb, C, bytesC, ldc, k, alpha, beta, 0);
+  __visc__wait(sgemmDFG);
+}
 
+int main(int argc, char *argv[]) {
 
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
 
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
 
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
+  /* Read in data */
+  // load A
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
+  // load B^T
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
 
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // copy A to device memory
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  // allocate space for C
+  C_sz = matArow * matBcol * sizeof(float);
 
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
+  // OpenCL memory allocation
+  std::vector<float> matC(matArow * matBcol);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_NONE );
+  llvm_visc_track_mem(&matA.front(), A_sz);
+  llvm_visc_track_mem(&matBT.front(), B_sz);
+  llvm_visc_track_mem(&matC.front(), C_sz);
+  // Copy A and B^T into device memory
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    // Use standard sgemm interface
-    basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow);
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
+             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
+             matArow);
 
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_visc_request_mem(&matC.front(), C_sz);
 
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  llvm_visc_untrack_mem(&matA.front());
+  llvm_visc_untrack_mem(&matBT.front());
+  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    if (params->outFile) {
+  pb_PrintTimerSet(&timers);
+  __visc__cleanup();
 
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
+  if (params->outFile) {
+
+    /* Write C to file */
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
 
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_FreeParameters(params);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc
index 19f72c9dff35885b9e2c1f8c38502ac59fb6ab6b..a4c252d8f183e76f91349d97872dbca0b3766acf 100644
--- a/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc
+++ b/hpvm/test/parboil/benchmarks/sgemm/src/visc_vec_opt/main.cc
@@ -10,219 +10,218 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include <stdio.h>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
 #include <vector>
-#include <iostream>
-#include <parboil.h>
 #include <visc.h>
 
 // I/O routines
-extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v);
-extern bool writeColMajorMatrixFile(const char *fn, int, int, std::vector<float>&);
-extern char* readFile(const char*);
+extern bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                                   std::vector<float> &v);
+extern bool writeColMajorMatrixFile(const char *fn, int, int,
+                                    std::vector<float> &);
+extern char *readFile(const char *);
 
 // Parameters of tile sizes
 #define TILE_N 8
 #define TILE_TB_HEIGHT 8
-#define TILE_M (TILE_N*TILE_TB_HEIGHT)
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     std::cout<<errorMessage<<" Error!\n";  \
-     std::cout<<"Line: "<<__LINE__<<"\n";   \
-     exit(1);                               \
-  }
+#define TILE_M (TILE_N * TILE_TB_HEIGHT)
 
-void mysgemmNT( float* A, int lda, float* B, int ldb, float* C, int ldc, int k, float alpha, float beta )
-{
-    __visc__hint(visc::SPIR_TARGET);
-    __visc__attributes(3, A, B, C, 1, C);
-
-    float c[TILE_N];
-    for (int i=0; i < TILE_N; i++)
-	c[i] = 0.0f;
-   
-    int mid = get_local_id(1)*get_local_size(0)+get_local_id(0);
-    int m = get_group_id(0) * TILE_M + mid;
-
-    int b_base = 0;
-
-    for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
-	float a; 
-        b_base = get_group_id(1) * TILE_N + i * ldb;
-
-	for (int j = 0; j < TILE_TB_HEIGHT; j++) {
-	    a = A[m + (i+j)*lda];
-	    for (int kk = 0; kk < TILE_N; kk++)
-		c[kk] += a * B[b_base + j * ldb + kk];
-
-	}
-    }
-    int t = ldc * get_group_id(1) * TILE_N + m;
-    for (int i = 0; i < TILE_N; i++) {
-	C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i];
-    }
-/*
-    Will be substituted by this kernel at the llvm level
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    std::cout << errorMessage << " Error!\n";                                  \
+    std::cout << "Line: " << __LINE__ << "\n";                                 \
+    exit(1);                                                                   \
+  }
 
-    // Partial results 
-    floatn cp = (floatn)(0.0f);
+void mysgemmNT(float *A, int lda, float *B, int ldb, float *C, int ldc, int k,
+               float alpha, float beta) {
+  __visc__hint(visc::SPIR_TARGET);
+  __visc__attributes(3, A, B, C, 1, C);
 
-    int mid = get_local_id(1)*get_local_size(0)+get_local_id(0);
-    int m = get_group_id(0) * TILE_M + mid;
+  float c[TILE_N];
+  for (int i = 0; i < TILE_N; i++)
+    c[i] = 0.0f;
 
-    int b_base = 0;
+  int mid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+  int m = get_group_id(0) * TILE_M + mid;
 
-    for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
-	float a;
-        b_base = get_group_id(1) * TILE_N + i * ldb;
+  int b_base = 0;
 
-	for (int j = 0; j < TILE_TB_HEIGHT; j++) {
-	    a = A[m + (i+j)*lda];
-	    cp += a * vloadn(0, B + b_base + j * ldb);
-	}
-    }
+  for (int i = 0; i < k; i += TILE_TB_HEIGHT) {
+    float a;
+    b_base = get_group_id(1) * TILE_N + i * ldb;
 
-    cp = alpha * cp;
-    float c[TILE_N];
-    c[0] = cp.s0;
-    c[1] = cp.s1;
-    c[2] = cp.s2;
-    c[3] = cp.s3;
-    c[4] = cp.s4;
-    c[5] = cp.s5;
-    c[6] = cp.s6;
-    c[7] = cp.s7;
-
-    int t = ldc * get_group_id(1) * TILE_N + m;
-    for (int i = 0; i < TILE_N; i++) {
-	C[t+i*ldc] = C[t+i*ldc] * beta + c[i];
+    for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+      a = A[m + (i + j) * lda];
+      for (int kk = 0; kk < TILE_N; kk++)
+        c[kk] += a * B[b_base + j * ldb + kk];
     }
-
-*/
+  }
+  int t = ldc * get_group_id(1) * TILE_N + m;
+  for (int i = 0; i < TILE_N; i++) {
+    C[t + i * ldc] = C[t + i * ldc] * beta + alpha * c[i];
+  }
+  /*
+      Will be substituted by this kernel at the llvm level
+
+      // Partial results
+      floatn cp = (floatn)(0.0f);
+
+      int mid = get_local_id(1)*get_local_size(0)+get_local_id(0);
+      int m = get_group_id(0) * TILE_M + mid;
+
+      int b_base = 0;
+
+      for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
+          float a;
+          b_base = get_group_id(1) * TILE_N + i * ldb;
+
+          for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+              a = A[m + (i+j)*lda];
+              cp += a * vloadn(0, B + b_base + j * ldb);
+          }
+      }
+
+      cp = alpha * cp;
+      float c[TILE_N];
+      c[0] = cp.s0;
+      c[1] = cp.s1;
+      c[2] = cp.s2;
+      c[3] = cp.s3;
+      c[4] = cp.s4;
+      c[5] = cp.s5;
+      c[6] = cp.s6;
+      c[7] = cp.s7;
+
+      int t = ldc * get_group_id(1) * TILE_N + m;
+      for (int i = 0; i < TILE_N; i++) {
+          C[t+i*ldc] = C[t+i*ldc] * beta + c[i];
+      }
+
+  */
 }
 
-__attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int n, int k, float alpha, float* A, size_t bytesA, int lda, float* B, size_t bytesB, int ldb, float beta, float* C, size_t bytesC, int ldc )
-{
-    if ((transa != 'N') && (transa != 'n')) {
-        std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
-        return;
-    }
+__attribute__((noinline)) void basicSgemm(char transa, char transb, int m,
+                                          int n, int k, float alpha, float *A,
+                                          size_t bytesA, int lda, float *B,
+                                          size_t bytesB, int ldb, float beta,
+                                          float *C, size_t bytesC, int ldc) {
+  if ((transa != 'N') && (transa != 'n')) {
+    std::cerr << "unsupported value of 'transa' in regtileSgemm()" << std::endl;
+    return;
+  }
 
-    if ((transb != 'T') && (transb != 't')) {
-        std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
-        return;
-    }
+  if ((transb != 'T') && (transb != 't')) {
+    std::cerr << "unsupported value of 'transb' in regtileSgemm()" << std::endl;
+    return;
+  }
 
-    // In this code we assume the matrix sizes are multiple of tile size
-  if ((m%TILE_M) || (n%TILE_N)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-              << "; n should be multiple of " << TILE_N << std::endl;
-        return;
-    }
+  // In this code we assume the matrix sizes are multiple of tile size
+  if ((m % TILE_M) || (n % TILE_N)) {
+    std::cerr << "unsupported size of matrix. m should be multiple of "
+              << TILE_M << "; n should be multiple of " << TILE_N << std::endl;
+    return;
+  }
 
-//    unsigned db[2] = {TILE_SZ/VEC_SZ,TILE_SZ};
-//    unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
-    unsigned db[2] = {TILE_N,TILE_TB_HEIGHT};
-    unsigned dg[2] = {m*TILE_N/TILE_M,n*TILE_TB_HEIGHT/TILE_N};
+  //    unsigned db[2] = {TILE_SZ/VEC_SZ,TILE_SZ};
+  //    unsigned dg[2] = {m/TILE_SZ*db[0],n/TILE_SZ*db[1]};
+  unsigned db[2] = {TILE_N, TILE_TB_HEIGHT};
+  unsigned dg[2] = {m * TILE_N / TILE_M, n * TILE_TB_HEIGHT / TILE_N};
 
-    void* sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
-    __visc__wait(sgemmDFG);
+  void *sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0] / db[0],
+                                dg[1] / db[1], 12, A, bytesA, lda, B, bytesB,
+                                ldb, C, bytesC, ldc, k, alpha, beta, 0);
+  __visc__wait(sgemmDFG);
 }
 
-int main (int argc, char *argv[]) {
-
-    struct pb_Parameters *params;
-    struct pb_TimerSet timers;
-
-    size_t A_sz, B_sz, C_sz;
-    int matArow, matAcol;
-    int matBrow, matBcol;
-    std::vector<float> matA, matBT;
+int main(int argc, char *argv[]) {
 
+  struct pb_Parameters *params;
+  struct pb_TimerSet timers;
 
-    /* Read command line. Expect 3 inputs: A, B and B^T
-       in column-major layout*/
-    params = pb_ReadParameters(&argc, argv);
-    if ((params->inpFiles[0] == NULL)
-            || (params->inpFiles[1] == NULL)
-            || (params->inpFiles[2] == NULL)
-            || (params->inpFiles[3] != NULL))
-    {
-        fprintf(stderr, "Expecting three input filenames\n");
-        exit(-1);
-    }
+  size_t A_sz, B_sz, C_sz;
+  int matArow, matAcol;
+  int matBrow, matBcol;
+  std::vector<float> matA, matBT;
 
-    /* Read in data */
-    // load A
-    readColMajorMatrixFile(params->inpFiles[0],
-                           matArow, matAcol, matA);
+  /* Read command line. Expect 3 inputs: A, B and B^T
+     in column-major layout*/
+  params = pb_ReadParameters(&argc, argv);
+  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] == NULL) ||
+      (params->inpFiles[2] == NULL) || (params->inpFiles[3] != NULL)) {
+    fprintf(stderr, "Expecting three input filenames\n");
+    exit(-1);
+  }
 
-    // load B^T
-    readColMajorMatrixFile(params->inpFiles[2],
-                           matBcol, matBrow, matBT);
+  /* Read in data */
+  // load A
+  readColMajorMatrixFile(params->inpFiles[0], matArow, matAcol, matA);
 
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
+  // load B^T
+  readColMajorMatrixFile(params->inpFiles[2], matBcol, matBrow, matBT);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-    // copy A to device memory
-    A_sz = matArow*matAcol*sizeof(float);
-    B_sz = matBrow*matBcol*sizeof(float);
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
 
-    // allocate space for C
-    C_sz = matArow*matBcol*sizeof(float);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // copy A to device memory
+  A_sz = matArow * matAcol * sizeof(float);
+  B_sz = matBrow * matBcol * sizeof(float);
 
-    // OpenCL memory allocation
-    std::vector<float> matC(matArow*matBcol);
+  // allocate space for C
+  C_sz = matArow * matBcol * sizeof(float);
 
-    pb_SwitchToTimer( &timers, visc_TimerID_MEM_TRACK );
-    llvm_visc_track_mem(&matA.front(), A_sz);
-    llvm_visc_track_mem(&matBT.front(), B_sz);
-    llvm_visc_track_mem(&matC.front(), C_sz);
+  // OpenCL memory allocation
+  std::vector<float> matC(matArow * matBcol);
 
-    // Copy A and B^T into device memory
-    pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
+  llvm_visc_track_mem(&matA.front(), A_sz);
+  llvm_visc_track_mem(&matBT.front(), B_sz);
+  llvm_visc_track_mem(&matC.front(), C_sz);
 
-    for(size_t i=0; i<matC.size(); i++)
-        matC[i] = 0.0f;
+  // Copy A and B^T into device memory
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_NONE );
+  for (size_t i = 0; i < matC.size(); i++)
+    matC[i] = 0.0f;
 
-    // Use standard sgemm interface
-    basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, \
-               &matA.front(), A_sz, matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz, matArow);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-    llvm_visc_request_mem(&matC.front(), C_sz);
+  // Use standard sgemm interface
+  basicSgemm('N', 'T', matArow, matBcol, matAcol, 1.0f, &matA.front(), A_sz,
+             matArow, &matBT.front(), B_sz, matBcol, 0.0f, &matC.front(), C_sz,
+             matArow);
 
-    pb_SwitchToTimer( &timers, visc_TimerID_MEM_UNTRACK );
-    llvm_visc_untrack_mem(&matA.front());
-    llvm_visc_untrack_mem(&matBT.front());
-    llvm_visc_untrack_mem(&matC.front());
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_visc_request_mem(&matC.front(), C_sz);
 
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  llvm_visc_untrack_mem(&matA.front());
+  llvm_visc_untrack_mem(&matBT.front());
+  llvm_visc_untrack_mem(&matC.front());
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
+  pb_PrintTimerSet(&timers);
+  __visc__cleanup();
 
-    if (params->outFile) {
+  if (params->outFile) {
 
-        /* Write C to file */
-        //pb_SwitchToTimer(&timers, pb_TimerID_IO);
-        writeColMajorMatrixFile(params->outFile,
-                                matArow, matBcol, matC);
-    }
+    /* Write C to file */
+    // pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    writeColMajorMatrixFile(params->outFile, matArow, matBcol, matC);
+  }
 
-    double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
-    std::cout<< "GFLOPs = " << 2.* matArow * matBcol * matAcol/GPUtime/1e9 << std::endl;
-    pb_FreeParameters(params);
+  double GPUtime = pb_GetElapsedTime(&(timers.timers[pb_TimerID_KERNEL]));
+  std::cout << "GFLOPs = " << 2. * matArow * matBcol * matAcol / GPUtime / 1e9
+            << std::endl;
+  pb_FreeParameters(params);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.c b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.c
index 7e7672e8079edc6c40b77933b80a83b1fd4c71c3..bf4c1fe9553fd5399c076d91d0bc758734bc2d01 100644
--- a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.c
+++ b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.c
@@ -1,46 +1,44 @@
 /*
-*   NOTES:
-*
-*   1) Matrix Market files are always 1-based, i.e. the index of the first
-*      element of a matrix is (1,1), not (0,0) as in C.  ADJUST THESE
-*      OFFSETS ACCORDINGLY when reading and writing 
-*      to files.
-*
-*   2) ANSI C requires one to use the "l" format modifier when reading
-*      double precision floating point numbers in scanf() and
-*      its variants.  For example, use "%lf", "%lg", or "%le"
-*      when reading doubles, otherwise errors will occur.
-*/
+ *   NOTES:
+ *
+ *   1) Matrix Market files are always 1-based, i.e. the index of the first
+ *      element of a matrix is (1,1), not (0,0) as in C.  ADJUST THESE
+ *      OFFSETS ACCORDINGLY when reading and writing
+ *      to files.
+ *
+ *   2) ANSI C requires one to use the "l" format modifier when reading
+ *      double precision floating point numbers in scanf() and
+ *      its variants.  For example, use "%lf", "%lg", or "%le"
+ *      when reading doubles, otherwise errors will occur.
+ */
 
+#include "convert_dataset.h"
+#include "mmio.h"
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <math.h>
-#include "mmio.h"
-#include "convert_dataset.h"
-
-
 
 typedef struct _mat_entry {
-    int row, col; /* i,j */
-    float val;
+  int row, col; /* i,j */
+  float val;
 } mat_entry;
 
 typedef struct _row_stats { // stats on each row
-    int index;
-    int size;
-    int start;
-    int padding;
+  int index;
+  int size;
+  int start;
+  int padding;
 } row_stats;
 
-int sort_rows(const void* a, const void* b) {
-    return (((mat_entry*)a)->row - ((mat_entry*)b)->row);
+int sort_rows(const void *a, const void *b) {
+  return (((mat_entry *)a)->row - ((mat_entry *)b)->row);
 }
-int sort_cols(const void* a, const void* b) {
-    return (((mat_entry*)a)->col - ((mat_entry*)b)->col);
+int sort_cols(const void *a, const void *b) {
+  return (((mat_entry *)a)->col - ((mat_entry *)b)->col);
 }
 /* sorts largest by size first */
-int sort_stats(const void* a, const void* b) {
-    return(((row_stats*)b)->size - ((row_stats*)a)->size);
+int sort_stats(const void *a, const void *b) {
+  return (((row_stats *)b)->size - ((row_stats *)a)->size);
 }
 
 /*
@@ -75,262 +73,279 @@ int sort_stats(const void* a, const void* b) {
  *   dim - dimensions of the input matrix
  *   data_ptr_len - size of data_row_ptr (maps to original `depth` var)
  */
-int coo_to_jds(char* mtx_filename, int pad_rows, int warp_size, int pack_size,
-	       int mirrored, int binary, int debug_level,
-               float** data, int** data_row_ptr, int** nz_count, int** data_col_index,
-               int** data_row_map, int* data_cols, int* dim, int* len, int* nz_count_len,
-	       int* data_ptr_len) {
-    int ret_code;
-    MM_typecode matcode;
-    FILE *f;
-    int nz;   
-    int i;
-    float *val;
-    mat_entry* entries;
-    row_stats* stats;
-    int rows, cols;
-    
-    if ((f = fopen(mtx_filename, "r")) == NULL) 
-        exit(1);
+int coo_to_jds(char *mtx_filename, int pad_rows, int warp_size, int pack_size,
+               int mirrored, int binary, int debug_level, float **data,
+               int **data_row_ptr, int **nz_count, int **data_col_index,
+               int **data_row_map, int *data_cols, int *dim, int *len,
+               int *nz_count_len, int *data_ptr_len) {
+  int ret_code;
+  MM_typecode matcode;
+  FILE *f;
+  int nz;
+  int i;
+  float *val;
+  mat_entry *entries;
+  row_stats *stats;
+  int rows, cols;
 
+  if ((f = fopen(mtx_filename, "r")) == NULL)
+    exit(1);
 
-    if (mm_read_banner(f, &matcode) != 0)
-    {
-        printf("Could not process Matrix Market banner.\n");
-        exit(1);
-    }
+  if (mm_read_banner(f, &matcode) != 0) {
+    printf("Could not process Matrix Market banner.\n");
+    exit(1);
+  }
 
+  /*  This is how one can screen matrix types if their application */
+  /*  only supports a subset of the Matrix Market data types.      */
 
-    /*  This is how one can screen matrix types if their application */
-    /*  only supports a subset of the Matrix Market data types.      */
+  if (mm_is_complex(matcode) && mm_is_matrix(matcode) &&
+      mm_is_sparse(matcode)) {
+    printf("Sorry, this application does not support ");
+    printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode));
+    exit(1);
+  }
 
-    if (mm_is_complex(matcode) && mm_is_matrix(matcode) && 
-            mm_is_sparse(matcode) )
-    {
-        printf("Sorry, this application does not support ");
-        printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode));
-        exit(1);
-    }
+  /* find out size of sparse matrix .... */
 
-    /* find out size of sparse matrix .... */
+  if ((ret_code = mm_read_mtx_crd_size(f, &rows, &cols, &nz)) != 0)
+    exit(1);
+  *dim = rows;
 
-    if ((ret_code = mm_read_mtx_crd_size(f, &rows, &cols, &nz)) !=0)
-        exit(1);
-    *dim = rows;
-    
-    if (mirrored) {
-	// max possible size, might be less because diagonal values aren't doubled
-	entries = (mat_entry*) malloc(2 * nz * sizeof(mat_entry));
-    } else {
-	entries = (mat_entry*) malloc(nz * sizeof(mat_entry));
-    }
-    
-    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
-    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
-    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
-    int cur_i=0; // to account for mirrored diagonal entries
+  if (mirrored) {
+    // max possible size, might be less because diagonal values aren't doubled
+    entries = (mat_entry *)malloc(2 * nz * sizeof(mat_entry));
+  } else {
+    entries = (mat_entry *)malloc(nz * sizeof(mat_entry));
+  }
 
-    for (i=0; i<nz; i++, cur_i++)
-    {
-	if (!binary) {
-	    fscanf(f, "%d %d %f\n", &entries[cur_i].row, &entries[cur_i].col, &entries[cur_i].val);
-	} else {
-	    fscanf(f, "%d %d\n", &entries[cur_i].row, &entries[cur_i].col);
-	    entries[cur_i].val = 1.0;
-	}
-        entries[cur_i].row--;
-        entries[cur_i].col--;
-	//printf("%d,%d = %f\n", entries[cur_i].row, entries[cur_i].col, entries[cur_i].val);
-	if (mirrored) {
-	    // fill in mirrored diagonal
-	    if (entries[cur_i].row != entries[cur_i].col) { // not a diagonal value
-		cur_i++;
-		entries[cur_i].val = entries[cur_i-1].val;
-		entries[cur_i].col = entries[cur_i-1].row;
-		entries[cur_i].row = entries[cur_i-1].col;
-		//printf("%d,%d = %f\n", entries[cur_i].row, entries[cur_i].col, entries[cur_i].val);
-	    }
-	}
+  /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
+  /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
+  /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
+  int cur_i = 0; // to account for mirrored diagonal entries
+
+  for (i = 0; i < nz; i++, cur_i++) {
+    if (!binary) {
+      fscanf(f, "%d %d %f\n", &entries[cur_i].row, &entries[cur_i].col,
+             &entries[cur_i].val);
+    } else {
+      fscanf(f, "%d %d\n", &entries[cur_i].row, &entries[cur_i].col);
+      entries[cur_i].val = 1.0;
     }
-    // set new non-zero count
-    nz = cur_i;
-    if (debug_level >= 1) {
-	printf("Converting COO to JDS format (%dx%d)\n%d matrix entries, warp size = %d, "
-	       "row padding align = %d, pack size = %d\n\n", rows, cols, nz, warp_size, pad_rows, pack_size);
+    entries[cur_i].row--;
+    entries[cur_i].col--;
+    // printf("%d,%d = %f\n", entries[cur_i].row, entries[cur_i].col,
+    // entries[cur_i].val);
+    if (mirrored) {
+      // fill in mirrored diagonal
+      if (entries[cur_i].row != entries[cur_i].col) { // not a diagonal value
+        cur_i++;
+        entries[cur_i].val = entries[cur_i - 1].val;
+        entries[cur_i].col = entries[cur_i - 1].row;
+        entries[cur_i].row = entries[cur_i - 1].col;
+        // printf("%d,%d = %f\n", entries[cur_i].row, entries[cur_i].col,
+        // entries[cur_i].val);
+      }
     }
-    if (f !=stdin) fclose(f);
+  }
+  // set new non-zero count
+  nz = cur_i;
+  if (debug_level >= 1) {
+    printf("Converting COO to JDS format (%dx%d)\n%d matrix entries, warp size "
+           "= %d, "
+           "row padding align = %d, pack size = %d\n\n",
+           rows, cols, nz, warp_size, pad_rows, pack_size);
+  }
+  if (f != stdin)
+    fclose(f);
 
-    /*
-     * Now we have an array of values in entries
-     * Transform to padded JDS format  - sort by rows, then fubini
-     */
+  /*
+   * Now we have an array of values in entries
+   * Transform to padded JDS format  - sort by rows, then fubini
+   */
 
-    int irow, icol=0, istart=0;
-    int total_size=0;
+  int irow, icol = 0, istart = 0;
+  int total_size = 0;
 
-    /* Loop through each entry to figure out padding, grouping that determine
-     * final data array size
-     *
-     * First calculate stats for each row
-     * 
-     * Collect stats using the major_stats typedef
-     */
-    
-    
-    qsort(entries, nz, sizeof(mat_entry), sort_rows); // sort by row number
-    rows = entries[nz-1].row+1; // last item is greatest row (zero indexed)
-    if (rows%warp_size) { // pad group number to warp_size here
-	rows += warp_size - rows%warp_size;
-    }
-    stats = (row_stats*) calloc(rows, sizeof(row_stats)); // set to 0
-    *data_row_map = (int*) calloc(rows, sizeof(int));
-    irow = entries[0].row; // set first row
-    
-    //printf("First row %d\n", irow);
-    for (i=0; i<nz; i++) { // loop through each sorted entry
-	if (entries[i].row != irow || i == nz-1) { // new row
-	    //printf("%d != %d\n", entries[i].row, irow);
-	    if (i == nz-1) {
-		// last item, add it to current row
-		//printf("Last item i=%d, row=%d, irow=%d\n", i, entries[i].row, irow);
-		icol++;
-	    }
-	    // hit a new row, record stats for the last row (i-1)
-	    stats[irow].size = icol; // record # cols in previous row
-	    stats[irow].index = entries[i-1].row; // row # for previous stat item
-	    //printf("Row %d, i=%d, irow=%d\n", entries[i].row, i, irow);
-	    stats[irow].start = istart; // starting location in entries array
-	    // set stats for the next row until this break again
-	    icol=0; // reset row items
-	    irow = entries[i].row;
-	    istart = i;
-	}
-	icol++; // keep track of number of items in this row
+  /* Loop through each entry to figure out the padding and grouping that
+   * determine the final data array size
+   *
+   * First calculate stats for each row
+   *
+   * Collect stats using the row_stats typedef
+   */
+
+  qsort(entries, nz, sizeof(mat_entry), sort_rows); // sort by row number
+  rows = entries[nz - 1].row + 1; // last item is greatest row (zero indexed)
+  if (rows % warp_size) {         // pad group number to warp_size here
+    rows += warp_size - rows % warp_size;
+  }
+  stats = (row_stats *)calloc(rows, sizeof(row_stats)); // set to 0
+  *data_row_map = (int *)calloc(rows, sizeof(int));
+  irow = entries[0].row; // set first row
+
+  // printf("First row %d\n", irow);
+  for (i = 0; i < nz; i++) { // loop through each sorted entry
+    if (entries[i].row != irow || i == nz - 1) { // new row
+      // printf("%d != %d\n", entries[i].row, irow);
+      if (i == nz - 1) {
+        // last item, add it to current row
+        // printf("Last item i=%d, row=%d, irow=%d\n", i, entries[i].row, irow);
+        icol++;
+      }
+      // hit a new row, record stats for the last row (i-1)
+      stats[irow].size = icol;                // record # cols in previous row
+      stats[irow].index = entries[i - 1].row; // row # for previous stat item
+      // printf("Row %d, i=%d, irow=%d\n", entries[i].row, i, irow);
+      stats[irow].start = istart; // starting location in entries array
+      // set stats for the next row until this break again
+      icol = 0; // reset row items
+      irow = entries[i].row;
+      istart = i;
     }
-    
-    
-    *nz_count_len = rows/warp_size + rows%warp_size;
-    *nz_count = (int*) malloc(*nz_count_len * sizeof(int)); // only one value per group
-    
-    /* sort based upon row size, greatest first */
-    qsort(stats, rows, sizeof(row_stats), sort_stats);
-    /* figure out padding and grouping */
-    if (debug_level >= 1) {
-	printf("Padding data....%d rows, %d groups\n", rows, *nz_count_len);
+    icol++; // keep track of number of items in this row
+  }
+
+  *nz_count_len = rows / warp_size + rows % warp_size;
+  *nz_count =
+      (int *)malloc(*nz_count_len * sizeof(int)); // only one value per group
+
+  /* sort based upon row size, greatest first */
+  qsort(stats, rows, sizeof(row_stats), sort_stats);
+  /* figure out padding and grouping */
+  if (debug_level >= 1) {
+    printf("Padding data....%d rows, %d groups\n", rows, *nz_count_len);
+  }
+  int pad_to, total_padding = 0, pack_to;
+  pad_rows *= pack_size; // change padding to account for packed items
+  for (i = 0; i < rows; i++) {
+    // record JDS to real row number
+    (*data_row_map)[i] = stats[i].index;
+    if (i < rows - 1) {
+      // (*data_row_map)[i]--; // ???? no idea why this is off by 1
     }
-    int pad_to, total_padding = 0, pack_to;
-    pad_rows *= pack_size; // change padding to account for packed items
-    for (i=0; i<rows; i++) {
-	// record JDS to real row number
-	(*data_row_map)[i] = stats[i].index;
-	if (i<rows-1) {
-	   // (*data_row_map)[i]--; // ???? no idea why this is off by 1
-	}
-	// each row is padded so the number of packed groups % pad_rows == 0
-	if (i % warp_size == 0) { // on a group boundary with the largest number of items
-	    // find padding in individual items
-	    if (stats[i].size % pad_rows) {
-		stats[i].padding = pad_rows - (stats[i].size % pad_rows); // find padding
-	    } else {
-		stats[i].padding = 0; // no padding necessary, already at pad multiple
-	    }
-	    if (stats[i].size % pack_size) {
-		pack_to = ceil((float)stats[i].size/pack_size);
-	    } else {
-		pack_to = stats[i].size/pack_size;
-	    }
-	    //pack_to = stats[i].size + (!stats[i].size%pack_size) ? 0 : (pack_size - stats[i].size%pack_size);
-	    pad_to = stats[i].size + stats[i].padding; // total size of this row, with padding
-	    // TODO: change this to reflect the real number of nonzero packed items, not the padded
-	    // value
-	    (*nz_count)[i/warp_size] = pack_to; // number of packed items in this group
-	    total_size += pad_to * warp_size; // allocate size for this padded group
-	    if (debug_level >= 2)
-		printf("Padding warp group %d to %d items, zn = %d\n", i/warp_size, pad_to, pack_to);
-	} else {
-	    stats[i].padding = pad_to - stats[i].size;
-	}
-	total_padding += stats[i].padding;
-	//if (debug_level >= 2)
-	//    printf("Row %d, %d items, %d padding\n", stats[i].index, stats[i].size, stats[i].padding);
+    // each row is padded so the number of packed groups % pad_rows == 0
+    if (i % warp_size ==
+        0) { // on a group boundary with the largest number of items
+      // find padding in individual items
+      if (stats[i].size % pad_rows) {
+        stats[i].padding =
+            pad_rows - (stats[i].size % pad_rows); // find padding
+      } else {
+        stats[i].padding = 0; // no padding necessary, already at pad multiple
+      }
+      if (stats[i].size % pack_size) {
+        pack_to = ceil((float)stats[i].size / pack_size);
+      } else {
+        pack_to = stats[i].size / pack_size;
+      }
+      // pack_to = stats[i].size + (!stats[i].size%pack_size) ? 0 : (pack_size -
+      // stats[i].size%pack_size);
+      pad_to = stats[i].size +
+               stats[i].padding; // total size of this row, with padding
+      // TODO: change this to reflect the real number of nonzero packed items,
+      // not the padded value
+      (*nz_count)[i / warp_size] =
+          pack_to;                      // number of packed items in this group
+      total_size += pad_to * warp_size; // allocate size for this padded group
+      if (debug_level >= 2)
+        printf("Padding warp group %d to %d items, zn = %d\n", i / warp_size,
+               pad_to, pack_to);
+    } else {
+      stats[i].padding = pad_to - stats[i].size;
     }
-    
-    /* allocate data and data_row_index */
-    if (debug_level >= 1)
-	printf("Allocating data space: %d entries (%f%% padding)\n", total_size, (float)100*total_padding/total_size);
-    *data = (float*) calloc(total_size, sizeof(float)); // set to 0 so padded values are set
-    *data_col_index = (int*) calloc(total_size, sizeof(int)); // any unset indexes point to 0
-    *data_row_ptr = (int*) calloc(rows, sizeof(int));
-    *len = total_size;
-    i = 0; // data index, including padding
-    
-    /*
-     * Keep looping through each row, writing data a group at a time
-     * to the output array. Increment `irow` each time, and use it as
-     * an index into entries along with stats.start to get the next
-     * data item
-     */
-    irow = 0; // keep track of which row we are in inside the fubini-ed array
-    int idata = 0; // position within final data array
-    int entry_index, j;
-    int ipack; // used in internal loop for writing packed values
-    mat_entry entry;
-    while (1) {
-	/* record data_row_ptr */
-	(*data_row_ptr)[irow] = idata;
-	
-	/* End condtion: the size of the greatest row is smaller than the current
-	  Fubini-ed row */
-	if (stats[0].size+stats[0].padding <= irow*pack_size) break;
+    total_padding += stats[i].padding;
+    // if (debug_level >= 2)
+    //    printf("Row %d, %d items, %d padding\n", stats[i].index,
+    //    stats[i].size, stats[i].padding);
+  }
+
+  /* allocate data, data_col_index and data_row_ptr */
+  if (debug_level >= 1)
+    printf("Allocating data space: %d entries (%f%% padding)\n", total_size,
+           (float)100 * total_padding / total_size);
+  *data = (float *)calloc(total_size,
+                          sizeof(float)); // set to 0 so padded values are set
+  *data_col_index =
+      (int *)calloc(total_size, sizeof(int)); // any unset indexes point to 0
+  *data_row_ptr = (int *)calloc(rows, sizeof(int));
+  *len = total_size;
+  i = 0; // data index, including padding
+
+  /*
+   * Keep looping through each row, writing data a group at a time
+   * to the output array. Increment `irow` each time, and use it as
+   * an index into entries along with stats.start to get the next
+   * data item
+   */
+  irow = 0;      // keep track of which row we are in inside the fubini-ed array
+  int idata = 0; // position within final data array
+  int entry_index, j;
+  int ipack; // used in internal loop for writing packed values
+  mat_entry entry;
+  while (1) {
+    /* record data_row_ptr */
+    (*data_row_ptr)[irow] = idata;
+
+    /* End condition: the size of the greatest row is smaller than the current
+      Fubini-ed row */
+    if (stats[0].size + stats[0].padding <= irow * pack_size)
+      break;
 
-	//printf("Data row pointer for row %d is %d\n", irow, idata);
-	for (i=0; i<rows; i++) {
-	    /* take one packed group from each original row */
-	    //printf("Output irow %d icol %d (real %d,%d size %d)\n", irow, i, entry.col, i, stats[i].size);
-	    /* Watch out for little vs big endian, and how opencl interprets vector casting from pointers */
-	    for (ipack=0; ipack<pack_size; ipack++) {
-		if (stats[i].size > irow*pack_size+ipack) {
-		    // copy value
-		    entry_index = stats[i].start + irow*pack_size+ipack;
-		    entry = entries[entry_index];
-		    /* record index and value */
-		    (*data)[idata] = entry.val;
-		    /* each data item will get its row index from the thread, col from here */
-		    (*data_col_index)[idata] = entry.col;
+    // printf("Data row pointer for row %d is %d\n", irow, idata);
+    for (i = 0; i < rows; i++) {
+      /* take one packed group from each original row */
+      // printf("Output irow %d icol %d (real %d,%d size %d)\n", irow, i,
+      // entry.col, i, stats[i].size);
+      /* Watch out for little vs big endian, and how opencl interprets vector
+       * casting from pointers */
+      for (ipack = 0; ipack < pack_size; ipack++) {
+        if (stats[i].size > irow * pack_size + ipack) {
+          // copy value
+          entry_index = stats[i].start + irow * pack_size + ipack;
+          entry = entries[entry_index];
+          /* record index and value */
+          (*data)[idata] = entry.val;
+          /* each data item will get its row index from the thread, col from
+           * here */
+          (*data_col_index)[idata] = entry.col;
 
-		    if (debug_level >= 2) {
-			if (i < 3) {
-			    // first row debugging
-			    printf("[%d row%d=%.3f]", ipack+1, i, entry.val);
-			} else {
-			    printf("%d", ipack+1);
-			}
-		    }
-		} else if (stats[i].size+stats[i].padding > irow*pack_size+ipack) {
-		    /* add padding to the end of each row here - this assumes padding is factored into allocated size */
-		    if (debug_level >= 2) printf("0");
-		    (*data_col_index)[idata] = -1;
-		} else {
-		    goto endwrite; // no data written this pass, so don't increment idata
-		}
-		idata += 1;
-	    }
-	}
-	endwrite:
-	if (debug_level >= 2) {
-	    printf("\n");
-	}
-	irow += 1;
+          if (debug_level >= 2) {
+            if (i < 3) {
+              // first row debugging
+              printf("[%d row%d=%.3f]", ipack + 1, i, entry.val);
+            } else {
+              printf("%d", ipack + 1);
+            }
+          }
+        } else if (stats[i].size + stats[i].padding >
+                   irow * pack_size + ipack) {
+          /* add padding to the end of each row here - this assumes padding is
+           * factored into allocated size */
+          if (debug_level >= 2)
+            printf("0");
+          (*data_col_index)[idata] = -1;
+        } else {
+          goto endwrite; // no data written this pass, so don't increment idata
+        }
+        idata += 1;
+      }
     }
-    
-    if (debug_level >= 1)
-	printf("Finished converting.\nJDS format has %d columns, %d rows.\n", rows, irow);
-    free(entries);
-    free(stats);
-    printf("nz_count_len = %d\n", *nz_count_len);
-    
-    *data_cols = rows;
-    *data_ptr_len = irow+1;
-    return 0;
-}
+  endwrite:
+    if (debug_level >= 2) {
+      printf("\n");
+    }
+    irow += 1;
+  }
 
+  if (debug_level >= 1)
+    printf("Finished converting.\nJDS format has %d columns, %d rows.\n", rows,
+           irow);
+  free(entries);
+  free(stats);
+  printf("nz_count_len = %d\n", *nz_count_len);
+
+  *data_cols = rows;
+  *data_ptr_len = irow + 1;
+  return 0;
+}
diff --git a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.h b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.h
index a495ffa68821594189f8de61a1e6a74536cd31b9..6713a9ed3e8e37a7089694c36bb81aead4c61122 100644
--- a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.h
+++ b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/convert_dataset.h
@@ -4,11 +4,11 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
-int coo_to_jds(char* mtx_filename, int pad_rows, int warp_size, int pack_size,
-	       int mirrored, int binary, int debug_level,
-               float** data, int** data_row_ptr, int** nz_count, int** data_col_index,
-               int** data_row_map, int* data_cols, int* dim, int* len, int* nz_count_len,
-	       int* data_ptr_len);
+int coo_to_jds(char *mtx_filename, int pad_rows, int warp_size, int pack_size,
+               int mirrored, int binary, int debug_level, float **data,
+               int **data_row_ptr, int **nz_count, int **data_col_index,
+               int **data_row_map, int *data_cols, int *dim, int *len,
+               int *nz_count_len, int *data_ptr_len);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.c b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.c
index c250ff2aed998fe65248537a8b19a359206187ce..1429b087c20888102f13d9296cbdacc108965e27 100644
--- a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.c
+++ b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.c
@@ -1,261 +1,234 @@
-/* 
-*   Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
-
+/*
+ *   Matrix Market I/O library for ANSI C
+ *
+ *   See http://math.nist.gov/MatrixMarket for details.
+ *
+ *
+ */
 
+#include <ctype.h>
 #include <stdio.h>
-#include <string.h>
 #include <stdlib.h>
-#include <ctype.h>
+#include <string.h>
 
 #include "mmio.h"
 
 int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
-                double **val_, int **I_, int **J_)
-{
-    FILE *f;
-    MM_typecode matcode;
-    int M, N, nz;
-    int i;
-    double *val;
-    int *I, *J;
- 
-    if ((f = fopen(fname, "r")) == NULL)
-            return -1;
- 
- 
-    if (mm_read_banner(f, &matcode) != 0)
-    {
-        printf("mm_read_unsymetric: Could not process Matrix Market banner ");
-        printf(" in file [%s]\n", fname);
-        return -1;
-    }
- 
- 
- 
-    if ( !(mm_is_real(matcode) && mm_is_matrix(matcode) &&
-            mm_is_sparse(matcode)))
-    {
-        fprintf(stderr, "Sorry, this application does not support ");
-        fprintf(stderr, "Market Market type: [%s]\n",
-                mm_typecode_to_str(matcode));
-        return -1;
-    }
- 
-    /* find out size of sparse matrix: M, N, nz .... */
- 
-    if (mm_read_mtx_crd_size(f, &M, &N, &nz) !=0)
-    {
-        fprintf(stderr, "read_unsymmetric_sparse(): could not parse matrix size.\n");
-        return -1;
-    }
- 
-    *M_ = M;
-    *N_ = N;
-    *nz_ = nz;
- 
-    /* reseve memory for matrices */
- 
-    I = (int *) malloc(nz * sizeof(int));
-    J = (int *) malloc(nz * sizeof(int));
-    val = (double *) malloc(nz * sizeof(double));
- 
-    *val_ = val;
-    *I_ = I;
-    *J_ = J;
- 
-    /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
-    /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
-    /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
- 
-    for (i=0; i<nz; i++)
-    {
-        fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
-        I[i]--;  /* adjust from 1-based to 0-based */
-        J[i]--;
-    }
-    fclose(f);
- 
+                               double **val_, int **I_, int **J_) {
+  FILE *f;
+  MM_typecode matcode;
+  int M, N, nz;
+  int i;
+  double *val;
+  int *I, *J;
+
+  if ((f = fopen(fname, "r")) == NULL)
+    return -1;
+
+  if (mm_read_banner(f, &matcode) != 0) {
+    printf("mm_read_unsymetric: Could not process Matrix Market banner ");
+    printf(" in file [%s]\n", fname);
+    return -1;
+  }
+
+  if (!(mm_is_real(matcode) && mm_is_matrix(matcode) &&
+        mm_is_sparse(matcode))) {
+    fprintf(stderr, "Sorry, this application does not support ");
+    fprintf(stderr, "Market Market type: [%s]\n", mm_typecode_to_str(matcode));
+    return -1;
+  }
+
+  /* find out size of sparse matrix: M, N, nz .... */
+
+  if (mm_read_mtx_crd_size(f, &M, &N, &nz) != 0) {
+    fprintf(stderr,
+            "read_unsymmetric_sparse(): could not parse matrix size.\n");
+    return -1;
+  }
+
+  *M_ = M;
+  *N_ = N;
+  *nz_ = nz;
+
+  /* reserve memory for matrices */
+
+  I = (int *)malloc(nz * sizeof(int));
+  J = (int *)malloc(nz * sizeof(int));
+  val = (double *)malloc(nz * sizeof(double));
+
+  *val_ = val;
+  *I_ = I;
+  *J_ = J;
+
+  /* NOTE: when reading in doubles, ANSI C requires the use of the "l"  */
+  /*   specifier as in "%lg", "%lf", "%le", otherwise errors will occur */
+  /*  (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15)            */
+
+  for (i = 0; i < nz; i++) {
+    fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]);
+    I[i]--; /* adjust from 1-based to 0-based */
+    J[i]--;
+  }
+  fclose(f);
+
+  return 0;
+}
+
+int mm_is_valid(MM_typecode matcode) {
+  if (!mm_is_matrix(matcode))
     return 0;
+  if (mm_is_dense(matcode) && mm_is_pattern(matcode))
+    return 0;
+  if (mm_is_real(matcode) && mm_is_hermitian(matcode))
+    return 0;
+  if (mm_is_pattern(matcode) &&
+      (mm_is_hermitian(matcode) || mm_is_skew(matcode)))
+    return 0;
+  return 1;
+}
+
+int mm_read_banner(FILE *f, MM_typecode *matcode) {
+  char line[MM_MAX_LINE_LENGTH];
+  char banner[MM_MAX_TOKEN_LENGTH];
+  char mtx[MM_MAX_TOKEN_LENGTH];
+  char crd[MM_MAX_TOKEN_LENGTH];
+  char data_type[MM_MAX_TOKEN_LENGTH];
+  char storage_scheme[MM_MAX_TOKEN_LENGTH];
+  char *p;
+
+  mm_clear_typecode(matcode);
+
+  if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
+    return MM_PREMATURE_EOF;
+
+  if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type,
+             storage_scheme) != 5)
+    return MM_PREMATURE_EOF;
+
+  for (p = mtx; *p != '\0'; *p = tolower(*p), p++)
+    ; /* convert to lower case */
+  for (p = crd; *p != '\0'; *p = tolower(*p), p++)
+    ;
+  for (p = data_type; *p != '\0'; *p = tolower(*p), p++)
+    ;
+  for (p = storage_scheme; *p != '\0'; *p = tolower(*p), p++)
+    ;
+
+  /* check for banner */
+  if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
+    return MM_NO_HEADER;
+
+  /* first field should be "mtx" */
+  if (strcmp(mtx, MM_MTX_STR) != 0)
+    return MM_UNSUPPORTED_TYPE;
+  mm_set_matrix(matcode);
+
+  /* second field describes whether this is a sparse matrix (in coordinate
+          storage) or a dense array */
+
+  if (strcmp(crd, MM_SPARSE_STR) == 0)
+    mm_set_sparse(matcode);
+  else if (strcmp(crd, MM_DENSE_STR) == 0)
+    mm_set_dense(matcode);
+  else
+    return MM_UNSUPPORTED_TYPE;
+
+  /* third field */
+
+  if (strcmp(data_type, MM_REAL_STR) == 0)
+    mm_set_real(matcode);
+  else if (strcmp(data_type, MM_COMPLEX_STR) == 0)
+    mm_set_complex(matcode);
+  else if (strcmp(data_type, MM_PATTERN_STR) == 0)
+    mm_set_pattern(matcode);
+  else if (strcmp(data_type, MM_INT_STR) == 0)
+    mm_set_integer(matcode);
+  else
+    return MM_UNSUPPORTED_TYPE;
+
+  /* fourth field */
+
+  if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
+    mm_set_general(matcode);
+  else if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
+    mm_set_symmetric(matcode);
+  else if (strcmp(storage_scheme, MM_HERM_STR) == 0)
+    mm_set_hermitian(matcode);
+  else if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
+    mm_set_skew(matcode);
+  else
+    return MM_UNSUPPORTED_TYPE;
+
+  return 0;
 }
 
-int mm_is_valid(MM_typecode matcode)
-{
-    if (!mm_is_matrix(matcode)) return 0;
-    if (mm_is_dense(matcode) && mm_is_pattern(matcode)) return 0;
-    if (mm_is_real(matcode) && mm_is_hermitian(matcode)) return 0;
-    if (mm_is_pattern(matcode) && (mm_is_hermitian(matcode) || 
-                mm_is_skew(matcode))) return 0;
-    return 1;
+int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz) {
+  if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
+    return MM_COULD_NOT_WRITE_FILE;
+  else
+    return 0;
 }
 
-int mm_read_banner(FILE *f, MM_typecode *matcode)
-{
-    char line[MM_MAX_LINE_LENGTH];
-    char banner[MM_MAX_TOKEN_LENGTH];
-    char mtx[MM_MAX_TOKEN_LENGTH]; 
-    char crd[MM_MAX_TOKEN_LENGTH];
-    char data_type[MM_MAX_TOKEN_LENGTH];
-    char storage_scheme[MM_MAX_TOKEN_LENGTH];
-    char *p;
+int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz) {
+  char line[MM_MAX_LINE_LENGTH];
+  int num_items_read;
 
+  /* set return null parameter values, in case we exit with errors */
+  *M = *N = *nz = 0;
 
-    mm_clear_typecode(matcode);  
+  /* now continue scanning until you reach the end-of-comments */
+  do {
+    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
+      return MM_PREMATURE_EOF;
+  } while (line[0] == '%');
 
-    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL) 
-        return MM_PREMATURE_EOF;
+  /* line[] is either blank or has M,N, nz */
+  if (sscanf(line, "%d %d %d", M, N, nz) == 3)
+    return 0;
 
-    if (sscanf(line, "%s %s %s %s %s", banner, mtx, crd, data_type, 
-        storage_scheme) != 5)
+  else
+    do {
+      num_items_read = fscanf(f, "%d %d %d", M, N, nz);
+      if (num_items_read == EOF)
         return MM_PREMATURE_EOF;
+    } while (num_items_read != 3);
 
-    for (p=mtx; *p!='\0'; *p=tolower(*p),p++);  /* convert to lower case */
-    for (p=crd; *p!='\0'; *p=tolower(*p),p++);  
-    for (p=data_type; *p!='\0'; *p=tolower(*p),p++);
-    for (p=storage_scheme; *p!='\0'; *p=tolower(*p),p++);
-
-    /* check for banner */
-    if (strncmp(banner, MatrixMarketBanner, strlen(MatrixMarketBanner)) != 0)
-        return MM_NO_HEADER;
-
-    /* first field should be "mtx" */
-    if (strcmp(mtx, MM_MTX_STR) != 0)
-        return  MM_UNSUPPORTED_TYPE;
-    mm_set_matrix(matcode);
-
-
-    /* second field describes whether this is a sparse matrix (in coordinate
-            storgae) or a dense array */
-
-
-    if (strcmp(crd, MM_SPARSE_STR) == 0)
-        mm_set_sparse(matcode);
-    else
-    if (strcmp(crd, MM_DENSE_STR) == 0)
-            mm_set_dense(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-    
-
-    /* third field */
-
-    if (strcmp(data_type, MM_REAL_STR) == 0)
-        mm_set_real(matcode);
-    else
-    if (strcmp(data_type, MM_COMPLEX_STR) == 0)
-        mm_set_complex(matcode);
-    else
-    if (strcmp(data_type, MM_PATTERN_STR) == 0)
-        mm_set_pattern(matcode);
-    else
-    if (strcmp(data_type, MM_INT_STR) == 0)
-        mm_set_integer(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-    
-
-    /* fourth field */
-
-    if (strcmp(storage_scheme, MM_GENERAL_STR) == 0)
-        mm_set_general(matcode);
-    else
-    if (strcmp(storage_scheme, MM_SYMM_STR) == 0)
-        mm_set_symmetric(matcode);
-    else
-    if (strcmp(storage_scheme, MM_HERM_STR) == 0)
-        mm_set_hermitian(matcode);
-    else
-    if (strcmp(storage_scheme, MM_SKEW_STR) == 0)
-        mm_set_skew(matcode);
-    else
-        return MM_UNSUPPORTED_TYPE;
-        
-
-    return 0;
+  return 0;
 }
 
-int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz)
-{
-    if (fprintf(f, "%d %d %d\n", M, N, nz) != 3)
-        return MM_COULD_NOT_WRITE_FILE;
-    else 
-        return 0;
-}
+int mm_read_mtx_array_size(FILE *f, int *M, int *N) {
+  char line[MM_MAX_LINE_LENGTH];
+  int num_items_read;
+  /* set return null parameter values, in case we exit with errors */
+  *M = *N = 0;
 
-int mm_read_mtx_crd_size(FILE *f, int *M, int *N, int *nz )
-{
-    char line[MM_MAX_LINE_LENGTH];
-    int num_items_read;
-
-    /* set return null parameter values, in case we exit with errors */
-    *M = *N = *nz = 0;
-
-    /* now continue scanning until you reach the end-of-comments */
-    do 
-    {
-        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
-            return MM_PREMATURE_EOF;
-    }while (line[0] == '%');
-
-    /* line[] is either blank or has M,N, nz */
-    if (sscanf(line, "%d %d %d", M, N, nz) == 3)
-        return 0;
-        
-    else
-    do
-    { 
-        num_items_read = fscanf(f, "%d %d %d", M, N, nz); 
-        if (num_items_read == EOF) return MM_PREMATURE_EOF;
-    }
-    while (num_items_read != 3);
+  /* now continue scanning until you reach the end-of-comments */
+  do {
+    if (fgets(line, MM_MAX_LINE_LENGTH, f) == NULL)
+      return MM_PREMATURE_EOF;
+  } while (line[0] == '%');
 
+  /* line[] is either blank or has M, N */
+  if (sscanf(line, "%d %d", M, N) == 2)
     return 0;
-}
-
 
-int mm_read_mtx_array_size(FILE *f, int *M, int *N)
-{
-    char line[MM_MAX_LINE_LENGTH];
-    int num_items_read;
-    /* set return null parameter values, in case we exit with errors */
-    *M = *N = 0;
-	
-    /* now continue scanning until you reach the end-of-comments */
-    do 
-    {
-        if (fgets(line,MM_MAX_LINE_LENGTH,f) == NULL) 
-            return MM_PREMATURE_EOF;
-    }while (line[0] == '%');
-
-    /* line[] is either blank or has M,N, nz */
-    if (sscanf(line, "%d %d", M, N) == 2)
-        return 0;
-        
-    else /* we have a blank line */
-    do
-    { 
-        num_items_read = fscanf(f, "%d %d", M, N); 
-        if (num_items_read == EOF) return MM_PREMATURE_EOF;
-    }
-    while (num_items_read != 2);
+  else /* we have a blank line */
+    do {
+      num_items_read = fscanf(f, "%d %d", M, N);
+      if (num_items_read == EOF)
+        return MM_PREMATURE_EOF;
+    } while (num_items_read != 2);
 
-    return 0;
+  return 0;
 }
 
-int mm_write_mtx_array_size(FILE *f, int M, int N)
-{
-    if (fprintf(f, "%d %d\n", M, N) != 2)
-        return MM_COULD_NOT_WRITE_FILE;
-    else 
-        return 0;
+int mm_write_mtx_array_size(FILE *f, int M, int N) {
+  if (fprintf(f, "%d %d\n", M, N) != 2)
+    return MM_COULD_NOT_WRITE_FILE;
+  else
+    return 0;
 }
 
-
-
 /*-------------------------------------------------------------------------*/
 
 /******************************************************************/
@@ -263,65 +236,50 @@ int mm_write_mtx_array_size(FILE *f, int M, int N)
 /******************************************************************/
 
 int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
-        double val[], MM_typecode matcode)
-{
-    int i;
-    if (mm_is_complex(matcode))
-    {
-        for (i=0; i<nz; i++)
-            if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2*i], &val[2*i+1])
-                != 4) return MM_PREMATURE_EOF;
-    }
-    else if (mm_is_real(matcode))
-    {
-        for (i=0; i<nz; i++)
-        {
-            if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i])
-                != 3) return MM_PREMATURE_EOF;
-
-        }
+                         double val[], MM_typecode matcode) {
+  int i;
+  if (mm_is_complex(matcode)) {
+    for (i = 0; i < nz; i++)
+      if (fscanf(f, "%d %d %lg %lg", &I[i], &J[i], &val[2 * i],
+                 &val[2 * i + 1]) != 4)
+        return MM_PREMATURE_EOF;
+  } else if (mm_is_real(matcode)) {
+    for (i = 0; i < nz; i++) {
+      if (fscanf(f, "%d %d %lg\n", &I[i], &J[i], &val[i]) != 3)
+        return MM_PREMATURE_EOF;
     }
+  }
 
-    else if (mm_is_pattern(matcode))
-    {
-        for (i=0; i<nz; i++)
-            if (fscanf(f, "%d %d", &I[i], &J[i])
-                != 2) return MM_PREMATURE_EOF;
-    }
-    else
-        return MM_UNSUPPORTED_TYPE;
+  else if (mm_is_pattern(matcode)) {
+    for (i = 0; i < nz; i++)
+      if (fscanf(f, "%d %d", &I[i], &J[i]) != 2)
+        return MM_PREMATURE_EOF;
+  } else
+    return MM_UNSUPPORTED_TYPE;
 
-    return 0;
-        
+  return 0;
 }
 
-int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
-        double *real, double *imag, MM_typecode matcode)
-{
-    if (mm_is_complex(matcode))
-    {
-            if (fscanf(f, "%d %d %lg %lg", I, J, real, imag)
-                != 4) return MM_PREMATURE_EOF;
-    }
-    else if (mm_is_real(matcode))
-    {
-            if (fscanf(f, "%d %d %lg\n", I, J, real)
-                != 3) return MM_PREMATURE_EOF;
+int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *imag,
+                          MM_typecode matcode) {
+  if (mm_is_complex(matcode)) {
+    if (fscanf(f, "%d %d %lg %lg", I, J, real, imag) != 4)
+      return MM_PREMATURE_EOF;
+  } else if (mm_is_real(matcode)) {
+    if (fscanf(f, "%d %d %lg\n", I, J, real) != 3)
+      return MM_PREMATURE_EOF;
 
-    }
+  }
 
-    else if (mm_is_pattern(matcode))
-    {
-            if (fscanf(f, "%d %d", I, J) != 2) return MM_PREMATURE_EOF;
-    }
-    else
-        return MM_UNSUPPORTED_TYPE;
+  else if (mm_is_pattern(matcode)) {
+    if (fscanf(f, "%d %d", I, J) != 2)
+      return MM_PREMATURE_EOF;
+  } else
+    return MM_UNSUPPORTED_TYPE;
 
-    return 0;
-        
+  return 0;
 }
 
-
 /************************************************************************
     mm_read_mtx_crd()  fills M, N, nz, array of values, and return
                         type code, e.g. 'MCRS'
@@ -330,182 +288,160 @@ int mm_read_mtx_crd_entry(FILE *f, int *I, int *J,
                             (nz pairs of real/imaginary values)
 ************************************************************************/
 
-int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J, 
-        double **val, MM_typecode *matcode)
-{
-    int ret_code;
-    FILE *f;
-
-    if (strcmp(fname, "stdin") == 0) f=stdin;
-    else
-    if ((f = fopen(fname, "r")) == NULL)
-        return MM_COULD_NOT_READ_FILE;
-
-
-    if ((ret_code = mm_read_banner(f, matcode)) != 0)
-        return ret_code;
-
-    if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) && 
-            mm_is_matrix(*matcode)))
-        return MM_UNSUPPORTED_TYPE;
-
-    if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
-        return ret_code;
-
-
-    *I = (int *)  malloc(*nz * sizeof(int));
-    *J = (int *)  malloc(*nz * sizeof(int));
-    *val = NULL;
-
-    if (mm_is_complex(*matcode))
-    {
-        *val = (double *) malloc(*nz * 2 * sizeof(double));
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
-    else if (mm_is_real(*matcode))
-    {
-        *val = (double *) malloc(*nz * sizeof(double));
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
+int mm_read_mtx_crd(char *fname, int *M, int *N, int *nz, int **I, int **J,
+                    double **val, MM_typecode *matcode) {
+  int ret_code;
+  FILE *f;
+
+  if (strcmp(fname, "stdin") == 0)
+    f = stdin;
+  else if ((f = fopen(fname, "r")) == NULL)
+    return MM_COULD_NOT_READ_FILE;
+
+  if ((ret_code = mm_read_banner(f, matcode)) != 0)
+    return ret_code;
+
+  if (!(mm_is_valid(*matcode) && mm_is_sparse(*matcode) &&
+        mm_is_matrix(*matcode)))
+    return MM_UNSUPPORTED_TYPE;
+
+  if ((ret_code = mm_read_mtx_crd_size(f, M, N, nz)) != 0)
+    return ret_code;
+
+  *I = (int *)malloc(*nz * sizeof(int));
+  *J = (int *)malloc(*nz * sizeof(int));
+  *val = NULL;
+
+  if (mm_is_complex(*matcode)) {
+    *val = (double *)malloc(*nz * 2 * sizeof(double));
+    ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, *matcode);
+    if (ret_code != 0)
+      return ret_code;
+  } else if (mm_is_real(*matcode)) {
+    *val = (double *)malloc(*nz * sizeof(double));
+    ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, *matcode);
+    if (ret_code != 0)
+      return ret_code;
+  }
+
+  else if (mm_is_pattern(*matcode)) {
+    ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, *matcode);
+    if (ret_code != 0)
+      return ret_code;
+  }
+
+  if (f != stdin)
+    fclose(f);
+  return 0;
+}
 
-    else if (mm_is_pattern(*matcode))
-    {
-        ret_code = mm_read_mtx_crd_data(f, *M, *N, *nz, *I, *J, *val, 
-                *matcode);
-        if (ret_code != 0) return ret_code;
-    }
+int mm_write_banner(FILE *f, MM_typecode matcode) {
+  char *str = mm_typecode_to_str(matcode);
+  int ret_code;
 
-    if (f != stdin) fclose(f);
+  ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
+  free(str);
+  if (ret_code != 2)
+    return MM_COULD_NOT_WRITE_FILE;
+  else
     return 0;
 }
 
-int mm_write_banner(FILE *f, MM_typecode matcode)
-{
-    char *str = mm_typecode_to_str(matcode);
-    int ret_code;
-
-    ret_code = fprintf(f, "%s %s\n", MatrixMarketBanner, str);
-    free(str);
-    if (ret_code !=2 )
-        return MM_COULD_NOT_WRITE_FILE;
-    else
-        return 0;
-}
-
 int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
-        double val[], MM_typecode matcode)
-{
-    FILE *f;
-    int i;
-
-    if (strcmp(fname, "stdout") == 0) 
-        f = stdout;
-    else
-    if ((f = fopen(fname, "w")) == NULL)
-        return MM_COULD_NOT_WRITE_FILE;
-    
-    /* print banner followed by typecode */
-    fprintf(f, "%s ", MatrixMarketBanner);
-    fprintf(f, "%s\n", mm_typecode_to_str(matcode));
-
-    /* print matrix sizes and nonzeros */
-    fprintf(f, "%d %d %d\n", M, N, nz);
-
-    /* print values */
-    if (mm_is_pattern(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d\n", I[i], J[i]);
-    else
-    if (mm_is_real(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
-    else
-    if (mm_is_complex(matcode))
-        for (i=0; i<nz; i++)
-            fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2*i], 
-                        val[2*i+1]);
-    else
-    {
-        if (f != stdout) fclose(f);
-        return MM_UNSUPPORTED_TYPE;
-    }
-
-    if (f !=stdout) fclose(f);
+                     double val[], MM_typecode matcode) {
+  FILE *f;
+  int i;
+
+  if (strcmp(fname, "stdout") == 0)
+    f = stdout;
+  else if ((f = fopen(fname, "w")) == NULL)
+    return MM_COULD_NOT_WRITE_FILE;
+
+  /* print banner followed by typecode */
+  fprintf(f, "%s ", MatrixMarketBanner);
+  fprintf(f, "%s\n", mm_typecode_to_str(matcode));
+
+  /* print matrix sizes and nonzeros */
+  fprintf(f, "%d %d %d\n", M, N, nz);
+
+  /* print values */
+  if (mm_is_pattern(matcode))
+    for (i = 0; i < nz; i++)
+      fprintf(f, "%d %d\n", I[i], J[i]);
+  else if (mm_is_real(matcode))
+    for (i = 0; i < nz; i++)
+      fprintf(f, "%d %d %20.16g\n", I[i], J[i], val[i]);
+  else if (mm_is_complex(matcode))
+    for (i = 0; i < nz; i++)
+      fprintf(f, "%d %d %20.16g %20.16g\n", I[i], J[i], val[2 * i],
+              val[2 * i + 1]);
+  else {
+    if (f != stdout)
+      fclose(f);
+    return MM_UNSUPPORTED_TYPE;
+  }
+
+  if (f != stdout)
+    fclose(f);
 
-    return 0;
+  return 0;
 }
-  
 
 /**
-*  Create a new copy of a string s.  mm_strdup() is a common routine, but
-*  not part of ANSI C, so it is included here.  Used by mm_typecode_to_str().
-*
-*/
-char *mm_strdup(const char *s)
-{
-	int len = strlen(s);
-	char *s2 = (char *) malloc((len+1)*sizeof(char));
-	return strcpy(s2, s);
+ *  Create a new copy of a string s.  mm_strdup() is a common routine, but
+ *  not part of ANSI C, so it is included here.  Used by mm_typecode_to_str().
+ *
+ */
+char *mm_strdup(const char *s) {
+  int len = strlen(s);
+  char *s2 = (char *)malloc((len + 1) * sizeof(char));
+  return strcpy(s2, s);
 }
 
-char  *mm_typecode_to_str(MM_typecode matcode)
-{
-    char buffer[MM_MAX_LINE_LENGTH];
-    char *types[4];
-	char *mm_strdup(const char *);
-    int error =0;
-
-    /* check for MTX type */
-    if (mm_is_matrix(matcode)) 
-        types[0] = MM_MTX_STR;
-    else
-        error=1;
-
-    /* check for CRD or ARR matrix */
-    if (mm_is_sparse(matcode))
-        types[1] = MM_SPARSE_STR;
-    else
-    if (mm_is_dense(matcode))
-        types[1] = MM_DENSE_STR;
-    else
-        return NULL;
-
-    /* check for element data type */
-    if (mm_is_real(matcode))
-        types[2] = MM_REAL_STR;
-    else
-    if (mm_is_complex(matcode))
-        types[2] = MM_COMPLEX_STR;
-    else
-    if (mm_is_pattern(matcode))
-        types[2] = MM_PATTERN_STR;
-    else
-    if (mm_is_integer(matcode))
-        types[2] = MM_INT_STR;
-    else
-        return NULL;
-
-
-    /* check for symmetry type */
-    if (mm_is_general(matcode))
-        types[3] = MM_GENERAL_STR;
-    else
-    if (mm_is_symmetric(matcode))
-        types[3] = MM_SYMM_STR;
-    else 
-    if (mm_is_hermitian(matcode))
-        types[3] = MM_HERM_STR;
-    else 
-    if (mm_is_skew(matcode))
-        types[3] = MM_SKEW_STR;
-    else
-        return NULL;
-
-    sprintf(buffer,"%s %s %s %s", types[0], types[1], types[2], types[3]);
-    return mm_strdup(buffer);
-
+char *mm_typecode_to_str(MM_typecode matcode) {
+  char buffer[MM_MAX_LINE_LENGTH];
+  char *types[4];
+  char *mm_strdup(const char *);
+  int error = 0;
+
+  /* check for MTX type */
+  if (mm_is_matrix(matcode))
+    types[0] = MM_MTX_STR;
+  else
+    error = 1;
+
+  /* check for CRD or ARR matrix */
+  if (mm_is_sparse(matcode))
+    types[1] = MM_SPARSE_STR;
+  else if (mm_is_dense(matcode))
+    types[1] = MM_DENSE_STR;
+  else
+    return NULL;
+
+  /* check for element data type */
+  if (mm_is_real(matcode))
+    types[2] = MM_REAL_STR;
+  else if (mm_is_complex(matcode))
+    types[2] = MM_COMPLEX_STR;
+  else if (mm_is_pattern(matcode))
+    types[2] = MM_PATTERN_STR;
+  else if (mm_is_integer(matcode))
+    types[2] = MM_INT_STR;
+  else
+    return NULL;
+
+  /* check for symmetry type */
+  if (mm_is_general(matcode))
+    types[3] = MM_GENERAL_STR;
+  else if (mm_is_symmetric(matcode))
+    types[3] = MM_SYMM_STR;
+  else if (mm_is_hermitian(matcode))
+    types[3] = MM_HERM_STR;
+  else if (mm_is_skew(matcode))
+    types[3] = MM_SKEW_STR;
+  else
+    return NULL;
+
+  sprintf(buffer, "%s %s %s %s", types[0], types[1], types[2], types[3]);
+  return mm_strdup(buffer);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.h b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.h
index 7cfd0a1b7ae7572e85b8c10bcb2fd0b3333ad0b6..ffb80cab0cc94b05a5b97d1d3b28b1b53d8c0d52 100644
--- a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.h
+++ b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/mmio.h
@@ -1,10 +1,10 @@
-/* 
-*   Matrix Market I/O library for ANSI C
-*
-*   See http://math.nist.gov/MatrixMarket for details.
-*
-*
-*/
+/*
+ *   Matrix Market I/O library for ANSI C
+ *
+ *   See http://math.nist.gov/MatrixMarket for details.
+ *
+ *
+ */
 
 #ifndef MM_IO_H
 #define MM_IO_H
@@ -25,109 +25,99 @@ int mm_write_banner(FILE *f, MM_typecode matcode);
 int mm_write_mtx_crd_size(FILE *f, int M, int N, int nz);
 int mm_write_mtx_array_size(FILE *f, int M, int N);
 
-
 /********************* MM_typecode query fucntions ***************************/
 
-#define mm_is_matrix(typecode)	((typecode)[0]=='M')
-
-#define mm_is_sparse(typecode)	((typecode)[1]=='C')
-#define mm_is_coordinate(typecode)((typecode)[1]=='C')
-#define mm_is_dense(typecode)	((typecode)[1]=='A')
-#define mm_is_array(typecode)	((typecode)[1]=='A')
+#define mm_is_matrix(typecode) ((typecode)[0] == 'M')
 
-#define mm_is_complex(typecode)	((typecode)[2]=='C')
-#define mm_is_real(typecode)		((typecode)[2]=='R')
-#define mm_is_pattern(typecode)	((typecode)[2]=='P')
-#define mm_is_integer(typecode) ((typecode)[2]=='I')
+#define mm_is_sparse(typecode) ((typecode)[1] == 'C')
+#define mm_is_coordinate(typecode) ((typecode)[1] == 'C')
+#define mm_is_dense(typecode) ((typecode)[1] == 'A')
+#define mm_is_array(typecode) ((typecode)[1] == 'A')
 
-#define mm_is_symmetric(typecode)((typecode)[3]=='S')
-#define mm_is_general(typecode)	((typecode)[3]=='G')
-#define mm_is_skew(typecode)	((typecode)[3]=='K')
-#define mm_is_hermitian(typecode)((typecode)[3]=='H')
+#define mm_is_complex(typecode) ((typecode)[2] == 'C')
+#define mm_is_real(typecode) ((typecode)[2] == 'R')
+#define mm_is_pattern(typecode) ((typecode)[2] == 'P')
+#define mm_is_integer(typecode) ((typecode)[2] == 'I')
 
-int mm_is_valid(MM_typecode matcode);		/* too complex for a macro */
+#define mm_is_symmetric(typecode) ((typecode)[3] == 'S')
+#define mm_is_general(typecode) ((typecode)[3] == 'G')
+#define mm_is_skew(typecode) ((typecode)[3] == 'K')
+#define mm_is_hermitian(typecode) ((typecode)[3] == 'H')
 
+int mm_is_valid(MM_typecode matcode); /* too complex for a macro */
 
 /********************* MM_typecode modify fucntions ***************************/
 
-#define mm_set_matrix(typecode)	((*typecode)[0]='M')
-#define mm_set_coordinate(typecode)	((*typecode)[1]='C')
-#define mm_set_array(typecode)	((*typecode)[1]='A')
-#define mm_set_dense(typecode)	mm_set_array(typecode)
-#define mm_set_sparse(typecode)	mm_set_coordinate(typecode)
-
-#define mm_set_complex(typecode)((*typecode)[2]='C')
-#define mm_set_real(typecode)	((*typecode)[2]='R')
-#define mm_set_pattern(typecode)((*typecode)[2]='P')
-#define mm_set_integer(typecode)((*typecode)[2]='I')
+#define mm_set_matrix(typecode) ((*typecode)[0] = 'M')
+#define mm_set_coordinate(typecode) ((*typecode)[1] = 'C')
+#define mm_set_array(typecode) ((*typecode)[1] = 'A')
+#define mm_set_dense(typecode) mm_set_array(typecode)
+#define mm_set_sparse(typecode) mm_set_coordinate(typecode)
 
+#define mm_set_complex(typecode) ((*typecode)[2] = 'C')
+#define mm_set_real(typecode) ((*typecode)[2] = 'R')
+#define mm_set_pattern(typecode) ((*typecode)[2] = 'P')
+#define mm_set_integer(typecode) ((*typecode)[2] = 'I')
 
-#define mm_set_symmetric(typecode)((*typecode)[3]='S')
-#define mm_set_general(typecode)((*typecode)[3]='G')
-#define mm_set_skew(typecode)	((*typecode)[3]='K')
-#define mm_set_hermitian(typecode)((*typecode)[3]='H')
+#define mm_set_symmetric(typecode) ((*typecode)[3] = 'S')
+#define mm_set_general(typecode) ((*typecode)[3] = 'G')
+#define mm_set_skew(typecode) ((*typecode)[3] = 'K')
+#define mm_set_hermitian(typecode) ((*typecode)[3] = 'H')
 
-#define mm_clear_typecode(typecode) ((*typecode)[0]=(*typecode)[1]= \
-									(*typecode)[2]=' ',(*typecode)[3]='G')
+#define mm_clear_typecode(typecode)                                            \
+  ((*typecode)[0] = (*typecode)[1] = (*typecode)[2] = ' ', (*typecode)[3] = 'G')
 
 #define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
 
-
 /********************* Matrix Market error codes ***************************/
 
-
-#define MM_COULD_NOT_READ_FILE	11
-#define MM_PREMATURE_EOF		12
-#define MM_NOT_MTX				13
-#define MM_NO_HEADER			14
-#define MM_UNSUPPORTED_TYPE		15
-#define MM_LINE_TOO_LONG		16
-#define MM_COULD_NOT_WRITE_FILE	17
-
+#define MM_COULD_NOT_READ_FILE 11
+#define MM_PREMATURE_EOF 12
+#define MM_NOT_MTX 13
+#define MM_NO_HEADER 14
+#define MM_UNSUPPORTED_TYPE 15
+#define MM_LINE_TOO_LONG 16
+#define MM_COULD_NOT_WRITE_FILE 17
 
 /******************** Matrix Market internal definitions ********************
 
    MM_matrix_typecode: 4-character sequence
 
-				    ojbect 		sparse/   	data        storage 
-						  		dense     	type        scheme
+                                    ojbect 		sparse/   	data
+ storage dense     	type        scheme
 
    string position:	 [0]        [1]			[2]         [3]
 
    Matrix typecode:  M(atrix)  C(oord)		R(eal)   	G(eneral)
-						        A(array)	C(omplex)   H(ermitian)
-											P(attern)   S(ymmetric)
-								    		I(nteger)	K(kew)
+                                                        A(array)
+ C(omplex)   H(ermitian) P(attern)   S(ymmetric) I(nteger)	K(kew)
 
  ***********************************************************************/
 
-#define MM_MTX_STR		"matrix"
-#define MM_ARRAY_STR	"array"
-#define MM_DENSE_STR	"array"
-#define MM_COORDINATE_STR "coordinate" 
-#define MM_SPARSE_STR	"coordinate"
-#define MM_COMPLEX_STR	"complex"
-#define MM_REAL_STR		"real"
-#define MM_INT_STR		"integer"
-#define MM_GENERAL_STR  "general"
-#define MM_SYMM_STR		"symmetric"
-#define MM_HERM_STR		"hermitian"
-#define MM_SKEW_STR		"skew-symmetric"
-#define MM_PATTERN_STR  "pattern"
-
+#define MM_MTX_STR "matrix"
+#define MM_ARRAY_STR "array"
+#define MM_DENSE_STR "array"
+#define MM_COORDINATE_STR "coordinate"
+#define MM_SPARSE_STR "coordinate"
+#define MM_COMPLEX_STR "complex"
+#define MM_REAL_STR "real"
+#define MM_INT_STR "integer"
+#define MM_GENERAL_STR "general"
+#define MM_SYMM_STR "symmetric"
+#define MM_HERM_STR "hermitian"
+#define MM_SKEW_STR "skew-symmetric"
+#define MM_PATTERN_STR "pattern"
 
 /*  high level routines */
 
 int mm_write_mtx_crd(char fname[], int M, int N, int nz, int I[], int J[],
-		 double val[], MM_typecode matcode);
+                     double val[], MM_typecode matcode);
 int mm_read_mtx_crd_data(FILE *f, int M, int N, int nz, int I[], int J[],
-		double val[], MM_typecode matcode);
+                         double val[], MM_typecode matcode);
 int mm_read_mtx_crd_entry(FILE *f, int *I, int *J, double *real, double *img,
-			MM_typecode matcode);
+                          MM_typecode matcode);
 
 int mm_read_unsymmetric_sparse(const char *fname, int *M_, int *N_, int *nz_,
-                double **val_, int **I_, int **J_);
-
-
+                               double **val_, int **I_, int **J_);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/test.c b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/test.c
index 0dc09ff3709ebbbbabc11909673ed699474fea25..ab82ed41118e7a5d596cd51273e897d944e34be7 100644
--- a/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/test.c
+++ b/hpvm/test/parboil/benchmarks/spmv/common_src/convert-dataset/test.c
@@ -1,19 +1,17 @@
 #include "convert_dataset.h"
 
 int main() {
-    float* data;
-    int* data_row_ptr, *nz_count, *data_col_index;
-    int *rows, cols, dim, nz_count_len, len;
-    
-    coo_to_jds(
-        "fidapm05.mtx", // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-        4, // row padding
-        4, // warp size
-        2, // pack size
-        1, // is mirrored?
-        0, // binary matrix
-        3, // debug level [0:2]
-        &data, &data_row_ptr, &nz_count, &data_col_index,
-        &rows, &cols, &dim, &len, &nz_count_len
-    );
+  float *data;
+  int *data_row_ptr, *nz_count, *data_col_index;
+  int *rows, cols, dim, nz_count_len, len;
+
+  coo_to_jds("fidapm05.mtx", // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             4,              // row padding
+             4,              // warp size
+             2,              // pack size
+             1,              // is mirrored?
+             0,              // binary matrix
+             3,              // debug level [0:2]
+             &data, &data_row_ptr, &nz_count, &data_col_index, &rows, &cols,
+             &dim, &len, &nz_count_len);
 }
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.c b/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.h b/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.h
index 560c32f4e992657920956c49c8c48deae8f9428c..abc849f930fb63d231d4453d2b9c07183e5758bd 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cpu/file.h
@@ -6,9 +6,9 @@
  *cr
  ***************************************************************************/
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cpu/main.c b/hpvm/test/parboil/benchmarks/spmv/src/cpu/main.c
index 8fa9e339b89b50a26c5cb205e80849875d75e4c8..0528323fd945bedb7d756deb61079c7fda9ce3a6 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cpu/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cpu/main.c
@@ -10,118 +10,103 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "file.h"
 #include "convert_dataset.h"
+#include "file.h"
 
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
 
+  printf("CPU-based sparse matrix vector multiplication****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and "
+         "Shengzhao Wu<wu14@illinois.edu>\n");
+  printf("This version maintained by Chris Rodrigues  ***********\n");
+  parameters = pb_ReadParameters(&argc, argv);
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting two input filenames\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // parameters declaration
+  int len;
+  int depth;
+  int dim;
+  int pad = 1;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  // load matrix from files
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	
-	
-	
-	
-	printf("CPU-based sparse matrix vector multiplication****\n");
-	printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-	printf("This version maintained by Chris Rodrigues  ***********\n");
-	parameters = pb_ReadParameters(&argc, argv);
-	if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    {
-      fprintf(stderr, "Expecting two input filenames\n");
-      exit(-1);
-    }
-	
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	//parameters declaration
-	int len;
-	int depth;
-	int dim;
-	int pad=1;
-	int nzcnt_len;
-	
-	//host memory allocation
-	//matrix
-	float *h_data;
-	int *h_indices;
-	int *h_ptr;
-	int *h_perm;
-	int *h_nzcnt;
-	//vector
-	float *h_Ax_vector;
-    float *h_x_vector;
-	
-	
-    //load matrix from files
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-	//    &h_data, &h_indices, &h_ptr,
-	//    &h_perm, &h_nzcnt);
-	int col_count;
-	coo_to_jds(
-		parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-		1, // row padding
-		pad, // warp size
-		1, // pack size
-		1, // is mirrored?
-		0, // binary matrix
-		1, // debug level [0:2]
-		&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-		&col_count, &dim, &len, &nzcnt_len, &depth
-	);		
-
-
-  h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-  h_x_vector=(float*)malloc(sizeof(float)*dim);
-  input_vec( parameters->inpFiles[1], h_x_vector,dim);
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-
-	
   int p, i, k;
-	//main execution
-	for(p=0;p<50;p++)
-	{
-		for (i = 0; i < dim; i++) {
-		  float sum = 0.0f;
-		  //int  bound = h_nzcnt[i / 32];
-		  int  bound = h_nzcnt[i];
-		  for(k=0;k<bound;k++ ) {
-			int j = h_ptr[k] + i;
-			int in = h_indices[j];
-
-			float d = h_data[j];
-			float t = h_x_vector[in];
-
-			sum += d*t;
-		  }
-		  h_Ax_vector[h_perm[i]] = sum;
-		}
-	}	
-
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Ax_vector,dim);
-		
-	}
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	free (h_data);
-	free (h_indices);
-	free (h_ptr);
-	free (h_perm);
-	free (h_nzcnt);
-	free (h_Ax_vector);
-	free (h_x_vector);
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+  // main execution
+  for (p = 0; p < 50; p++) {
+    for (i = 0; i < dim; i++) {
+      float sum = 0.0f;
+      // int  bound = h_nzcnt[i / 32];
+      int bound = h_nzcnt[i];
+      for (k = 0; k < bound; k++) {
+        int j = h_ptr[k] + i;
+        int in = h_indices[j];
+
+        float d = h_data[j];
+        float t = h_x_vector[in];
+
+        sum += d * t;
+      }
+      h_Ax_vector[h_perm[i]] = sum;
+    }
+  }
+
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
 
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.cc
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.cc
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.cc
index dd444910173f33dcc792665fb576333eeaed1a22..b9c4014eccdb1a61c7aad06daf108830409ae163 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.cc
@@ -1,61 +1,43 @@
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
+
+void compute_active_thread(unsigned int *thread, unsigned int *grid, int task,
+                           int pad, int major, int minor, int warp_size,
+                           int sm) {
+  int max_thread;
+  int max_warp;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2) {
+      max_thread = 1024;
+      max_warp = 32;
+    } else {
+      max_thread = 768;
+      max_warp = 24;
+    }
+  } else if (major == 2) {
+    max_thread = 1536;
+    max_warp = 48;
+  } else {
+    // newer GPU  //keep using 2.0
+    max_thread = 1536;
+    max_warp = 48;
+  }
 
-void compute_active_thread(unsigned int *thread,
-					unsigned int *grid,
-					int task,
-					int pad,
-					int major,
-					int minor,
-					int warp_size,
-					int sm)
-{
-	int max_thread;
-	int max_warp;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-		{
-			max_thread=1024;
-			max_warp=32;
-		}
-		else
-		{
-			max_thread=768;
-			max_warp=24;
-		}
-	}
-	else if(major==2)
-	{
-		max_thread=1536;
-		max_warp=48;
-	}
-	else
-	{
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-		max_warp=48;
-	}
-	
-	int _grid;
-	int _thread;
-	int threads_per_sm=0;
-	if(task*pad>sm*max_thread)
-	{
-		//_grid=sm*max_block;
-		_thread=max_thread/max_block;
-		_grid=(task*pad+_thread-1)/_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task;
-	}
-	thread[0]=_thread;
-	grid[0]=_grid;
-	
+  int _grid;
+  int _thread;
+  int threads_per_sm = 0;
+  if (task * pad > sm * max_thread) {
+    //_grid=sm*max_block;
+    _thread = max_thread / max_block;
+    _grid = (task * pad + _thread - 1) / _thread;
+  } else {
+    _thread = pad;
+    _grid = task;
+  }
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.h
index 6523dd2274622460bfdb8eec03b67e831b9d63aa..39f0f541af0cfdd7e23abbab49e773cf69d7ec36 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda/gpu_info.h
@@ -1,8 +1,3 @@
-void compute_active_thread(unsigned int *thread,
-					unsigned int *grid,
-					int task,
-					int pad,
-					int major,
-					int minor,
-					int warp_size,
-					int sm);
\ No newline at end of file
+void compute_active_thread(unsigned int *thread, unsigned int *grid, int task,
+                           int pad, int major, int minor, int warp_size,
+                           int sm);
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda/spmv_jds.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda/spmv_jds.h
index 075c239463c463a2b091b9d9b29647b4655ff13d..975057d6a91fc8f239ad0aa8f9b2cee4cd811e07 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda/spmv_jds.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda/spmv_jds.h
@@ -1,22 +1,21 @@
 
 
+#define CUERR                                                                  \
+  {                                                                            \
+    cudaError_t err;                                                           \
+    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
+      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__);  \
+      return -1;                                                               \
+    }                                                                          \
+  }
 
+// TEXTURE memory
+texture<float, 1> tex_x_float;
 
-
-#define CUERR { cudaError_t err; \
-  if ((err = cudaGetLastError()) != cudaSuccess) { \
-  printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-  return -1; }}
-  
- 
-//TEXTURE memory
-texture<float,1> tex_x_float;
-
-//constant memory
+// constant memory
 __constant__ int jds_ptr_int[5000];
 __constant__ int sh_zcnt_int[5000];
 
-__global__ void spmv_jds(float *dst_vector,
-							   const float *d_data,const int *d_index, const int *d_perm,
-							   const float *x_vec,const int *d_nzcnt,const int dem);
-							   
+__global__ void spmv_jds(float *dst_vector, const float *d_data,
+                         const int *d_index, const int *d_perm,
+                         const float *x_vec, const int *d_nzcnt, const int dem);
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.cc
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.cc
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.cc
index dd444910173f33dcc792665fb576333eeaed1a22..b9c4014eccdb1a61c7aad06daf108830409ae163 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.cc
@@ -1,61 +1,43 @@
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
+
+void compute_active_thread(unsigned int *thread, unsigned int *grid, int task,
+                           int pad, int major, int minor, int warp_size,
+                           int sm) {
+  int max_thread;
+  int max_warp;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2) {
+      max_thread = 1024;
+      max_warp = 32;
+    } else {
+      max_thread = 768;
+      max_warp = 24;
+    }
+  } else if (major == 2) {
+    max_thread = 1536;
+    max_warp = 48;
+  } else {
+    // newer GPU  //keep using 2.0
+    max_thread = 1536;
+    max_warp = 48;
+  }
 
-void compute_active_thread(unsigned int *thread,
-					unsigned int *grid,
-					int task,
-					int pad,
-					int major,
-					int minor,
-					int warp_size,
-					int sm)
-{
-	int max_thread;
-	int max_warp;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-		{
-			max_thread=1024;
-			max_warp=32;
-		}
-		else
-		{
-			max_thread=768;
-			max_warp=24;
-		}
-	}
-	else if(major==2)
-	{
-		max_thread=1536;
-		max_warp=48;
-	}
-	else
-	{
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-		max_warp=48;
-	}
-	
-	int _grid;
-	int _thread;
-	int threads_per_sm=0;
-	if(task*pad>sm*max_thread)
-	{
-		//_grid=sm*max_block;
-		_thread=max_thread/max_block;
-		_grid=(task*pad+_thread-1)/_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task;
-	}
-	thread[0]=_thread;
-	grid[0]=_grid;
-	
+  int _grid;
+  int _thread;
+  int threads_per_sm = 0;
+  if (task * pad > sm * max_thread) {
+    //_grid=sm*max_block;
+    _thread = max_thread / max_block;
+    _grid = (task * pad + _thread - 1) / _thread;
+  } else {
+    _thread = pad;
+    _grid = task;
+  }
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.h
index 6523dd2274622460bfdb8eec03b67e831b9d63aa..39f0f541af0cfdd7e23abbab49e773cf69d7ec36 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/gpu_info.h
@@ -1,8 +1,3 @@
-void compute_active_thread(unsigned int *thread,
-					unsigned int *grid,
-					int task,
-					int pad,
-					int major,
-					int minor,
-					int warp_size,
-					int sm);
\ No newline at end of file
+void compute_active_thread(unsigned int *thread, unsigned int *grid, int task,
+                           int pad, int major, int minor, int warp_size,
+                           int sm);
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/spmv_jds.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/spmv_jds.h
index c9d3062a28dbbcc3bf073884205e1d5054d8bd1f..56ed524240725ae252af41fa43c0a43b208abdd6 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/spmv_jds.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base/spmv_jds.h
@@ -1,22 +1,22 @@
 
 
+#define CUERR                                                                  \
+  {                                                                            \
+    cudaError_t err;                                                           \
+    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
+      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__);  \
+      return -1;                                                               \
+    }                                                                          \
+  }
 
+// TEXTURE memory
+texture<float, 1> tex_x_float;
 
-
-#define CUERR { cudaError_t err; \
-  if ((err = cudaGetLastError()) != cudaSuccess) { \
-  printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-  return -1; }}
-  
- 
-//TEXTURE memory
-texture<float,1> tex_x_float;
-
-//constant memory
+// constant memory
 __constant__ int jds_ptr_int[5000];
 __constant__ int sh_zcnt_int[5000];
 
-__global__ void spmv_jds_naive(float *dst_vector,
-							   const float *d_data,const int *d_index, const int *d_perm,
-							   const float *x_vec,const int *d_nzcnt,const int dem);
-							   
+__global__ void spmv_jds_naive(float *dst_vector, const float *d_data,
+                               const int *d_index, const int *d_perm,
+                               const float *x_vec, const int *d_nzcnt,
+                               const int dem);
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.cc
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.cc
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.cc
index dd444910173f33dcc792665fb576333eeaed1a22..b9c4014eccdb1a61c7aad06daf108830409ae163 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.cc
@@ -1,61 +1,43 @@
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
+
+void compute_active_thread(unsigned int *thread, unsigned int *grid, int task,
+                           int pad, int major, int minor, int warp_size,
+                           int sm) {
+  int max_thread;
+  int max_warp;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2) {
+      max_thread = 1024;
+      max_warp = 32;
+    } else {
+      max_thread = 768;
+      max_warp = 24;
+    }
+  } else if (major == 2) {
+    max_thread = 1536;
+    max_warp = 48;
+  } else {
+    // newer GPU  //keep using 2.0
+    max_thread = 1536;
+    max_warp = 48;
+  }
 
-void compute_active_thread(unsigned int *thread,
-					unsigned int *grid,
-					int task,
-					int pad,
-					int major,
-					int minor,
-					int warp_size,
-					int sm)
-{
-	int max_thread;
-	int max_warp;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-		{
-			max_thread=1024;
-			max_warp=32;
-		}
-		else
-		{
-			max_thread=768;
-			max_warp=24;
-		}
-	}
-	else if(major==2)
-	{
-		max_thread=1536;
-		max_warp=48;
-	}
-	else
-	{
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-		max_warp=48;
-	}
-	
-	int _grid;
-	int _thread;
-	int threads_per_sm=0;
-	if(task*pad>sm*max_thread)
-	{
-		//_grid=sm*max_block;
-		_thread=max_thread/max_block;
-		_grid=(task*pad+_thread-1)/_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task;
-	}
-	thread[0]=_thread;
-	grid[0]=_grid;
-	
+  int _grid;
+  int _thread;
+  int threads_per_sm = 0;
+  if (task * pad > sm * max_thread) {
+    //_grid=sm*max_block;
+    _thread = max_thread / max_block;
+    _grid = (task * pad + _thread - 1) / _thread;
+  } else {
+    _thread = pad;
+    _grid = task;
+  }
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.h
index 6523dd2274622460bfdb8eec03b67e831b9d63aa..39f0f541af0cfdd7e23abbab49e773cf69d7ec36 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/gpu_info.h
@@ -1,8 +1,3 @@
-void compute_active_thread(unsigned int *thread,
-					unsigned int *grid,
-					int task,
-					int pad,
-					int major,
-					int minor,
-					int warp_size,
-					int sm);
\ No newline at end of file
+void compute_active_thread(unsigned int *thread, unsigned int *grid, int task,
+                           int pad, int major, int minor, int warp_size,
+                           int sm);
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/spmv_jds.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/spmv_jds.h
index 8eef25854e60a98b14bbc0925d2d4886afafdaaf..70f7a33ae9bb6f0af9e59c153bba308c842273f7 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/spmv_jds.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_base_tex/spmv_jds.h
@@ -1,22 +1,22 @@
 
 
+#define CUERR                                                                  \
+  {                                                                            \
+    cudaError_t err;                                                           \
+    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
+      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__);  \
+      return -1;                                                               \
+    }                                                                          \
+  }
 
+// TEXTURE memory
+texture<float, 1> tex_x_float;
 
-
-#define CUERR { cudaError_t err; \
-  if ((err = cudaGetLastError()) != cudaSuccess) { \
-  printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-  return -1; }}
-  
- 
-//TEXTURE memory
-texture<float,1> tex_x_float;
-
-//constant memory
+// constant memory
 __constant__ int jds_ptr_int[5000];
 __constant__ int sh_zcnt_int[5000];
 
-__global__ void spmv_jds_texture(float *dst_vector,
-							   const float *d_data,const int *d_index, const int *d_perm,
-							   const float *x_vec,const int *d_nzcnt,const int dem);
-							   
+__global__ void spmv_jds_texture(float *dst_vector, const float *d_data,
+                                 const int *d_index, const int *d_perm,
+                                 const float *x_vec, const int *d_nzcnt,
+                                 const int dem);
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.cc
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.cc
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.cc b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.cc
index dd444910173f33dcc792665fb576333eeaed1a22..b9c4014eccdb1a61c7aad06daf108830409ae163 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.cc
@@ -1,61 +1,43 @@
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
+
+void compute_active_thread(unsigned int *thread, unsigned int *grid, int task,
+                           int pad, int major, int minor, int warp_size,
+                           int sm) {
+  int max_thread;
+  int max_warp;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2) {
+      max_thread = 1024;
+      max_warp = 32;
+    } else {
+      max_thread = 768;
+      max_warp = 24;
+    }
+  } else if (major == 2) {
+    max_thread = 1536;
+    max_warp = 48;
+  } else {
+    // newer GPU  //keep using 2.0
+    max_thread = 1536;
+    max_warp = 48;
+  }
 
-void compute_active_thread(unsigned int *thread,
-					unsigned int *grid,
-					int task,
-					int pad,
-					int major,
-					int minor,
-					int warp_size,
-					int sm)
-{
-	int max_thread;
-	int max_warp;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-		{
-			max_thread=1024;
-			max_warp=32;
-		}
-		else
-		{
-			max_thread=768;
-			max_warp=24;
-		}
-	}
-	else if(major==2)
-	{
-		max_thread=1536;
-		max_warp=48;
-	}
-	else
-	{
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-		max_warp=48;
-	}
-	
-	int _grid;
-	int _thread;
-	int threads_per_sm=0;
-	if(task*pad>sm*max_thread)
-	{
-		//_grid=sm*max_block;
-		_thread=max_thread/max_block;
-		_grid=(task*pad+_thread-1)/_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task;
-	}
-	thread[0]=_thread;
-	grid[0]=_grid;
-	
+  int _grid;
+  int _thread;
+  int threads_per_sm = 0;
+  if (task * pad > sm * max_thread) {
+    //_grid=sm*max_block;
+    _thread = max_thread / max_block;
+    _grid = (task * pad + _thread - 1) / _thread;
+  } else {
+    _thread = pad;
+    _grid = task;
+  }
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.h
index 6523dd2274622460bfdb8eec03b67e831b9d63aa..39f0f541af0cfdd7e23abbab49e773cf69d7ec36 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/gpu_info.h
@@ -1,8 +1,3 @@
-void compute_active_thread(unsigned int *thread,
-					unsigned int *grid,
-					int task,
-					int pad,
-					int major,
-					int minor,
-					int warp_size,
-					int sm);
\ No newline at end of file
+void compute_active_thread(unsigned int *thread, unsigned int *grid, int task,
+                           int pad, int major, int minor, int warp_size,
+                           int sm);
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/spmv_jds.h b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/spmv_jds.h
index 8eef25854e60a98b14bbc0925d2d4886afafdaaf..70f7a33ae9bb6f0af9e59c153bba308c842273f7 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/spmv_jds.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/cuda_tex/spmv_jds.h
@@ -1,22 +1,22 @@
 
 
+#define CUERR                                                                  \
+  {                                                                            \
+    cudaError_t err;                                                           \
+    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
+      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__);  \
+      return -1;                                                               \
+    }                                                                          \
+  }
 
+// TEXTURE memory
+texture<float, 1> tex_x_float;
 
-
-#define CUERR { cudaError_t err; \
-  if ((err = cudaGetLastError()) != cudaSuccess) { \
-  printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-  return -1; }}
-  
- 
-//TEXTURE memory
-texture<float,1> tex_x_float;
-
-//constant memory
+// constant memory
 __constant__ int jds_ptr_int[5000];
 __constant__ int sh_zcnt_int[5000];
 
-__global__ void spmv_jds_texture(float *dst_vector,
-							   const float *d_data,const int *d_index, const int *d_perm,
-							   const float *x_vec,const int *d_nzcnt,const int dem);
-							   
+__global__ void spmv_jds_texture(float *dst_vector, const float *d_data,
+                                 const int *d_index, const int *d_perm,
+                                 const float *x_vec, const int *d_nzcnt,
+                                 const int dem);
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.c b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.h b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.h
index 560c32f4e992657920956c49c8c48deae8f9428c..abc849f930fb63d231d4453d2b9c07183e5758bd 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/file.h
@@ -6,9 +6,9 @@
  *cr
  ***************************************************************************/
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/main.c b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/main.c
index 8046b490c4091a102b385013940a45198adf81a0..aa4c36e3e4f8962d81d97c074498f7b44ad06224 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/omp_base/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/omp_base/main.c
@@ -10,26 +10,21 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#include "file.h"
 #include "convert_dataset.h"
+#include "file.h"
 
-static int generate_vector(float *x_vector, int dim) 
-{	
-	srand(54321);	
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
   int i;
-	for(i=0;i<dim;i++)
-	{
-		x_vector[i] = (rand() / (float) RAND_MAX);
-	}
-	return 0;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
 /*
-void jdsmv(int height, int len, float* value, int* perm, int* jds_ptr, int* col_index, float* vector,
-        float* result){
-        int i;
-        int col,row;
-        int row_index =0;
+void jdsmv(int height, int len, float* value, int* perm, int* jds_ptr, int*
+col_index, float* vector, float* result){ int i; int col,row; int row_index =0;
         int prem_indicator=0;
         for (i=0; i<len; i++){
                 if (i>=jds_ptr[prem_indicator+1]){
@@ -47,120 +42,105 @@ void jdsmv(int height, int len, float* value, int* perm, int* jds_ptr, int* col_
         return;
 }
 */
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	
-	
-	
-	
-	printf("CPU-based sparse matrix vector multiplication****\n");
-	printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-	printf("This version maintained by Chris Rodrigues  ***********\n");
-	parameters = pb_ReadParameters(&argc, argv);
-	if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    {
-      fprintf(stderr, "Expecting two input filenames\n");
-      exit(-1);
-    }
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("CPU-based sparse matrix vector multiplication****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and "
+         "Shengzhao Wu<wu14@illinois.edu>\n");
+  printf("This version maintained by Chris Rodrigues  ***********\n");
+  parameters = pb_ReadParameters(&argc, argv);
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting two input filenames\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // parameters declaration
+  int len;
+  int depth;
+  int dim;
+  int pad = 1;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  // load matrix from files
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+
+  int col_count;
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  //  generate_vector(h_x_vector, dim);
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-	
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	//parameters declaration
-	int len;
-	int depth;
-	int dim;
-	int pad=1;
-	int nzcnt_len;
-	
-	//host memory allocation
-	//matrix
-	float *h_data;
-	int *h_indices;
-	int *h_ptr;
-	int *h_perm;
-	int *h_nzcnt;
-	//vector
-	float *h_Ax_vector;
-    float *h_x_vector;
-	
-	
-    //load matrix from files
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-	//    &h_data, &h_indices, &h_ptr,
-	//    &h_perm, &h_nzcnt);
-
- 
-
-	int col_count;
-	coo_to_jds(
-		parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-		1, // row padding
-		pad, // warp size
-		1, // pack size
-		1, // is mirrored?
-		0, // binary matrix
-		1, // debug level [0:2]
-		&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-		&col_count, &dim, &len, &nzcnt_len, &depth
-	);		
-
-  h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-  h_x_vector=(float*)malloc(sizeof(float)*dim);
-//  generate_vector(h_x_vector, dim);
-  input_vec( parameters->inpFiles[1],h_x_vector,dim);
-
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-	
   int p, i;
-	//main execution
-	for(p=0;p<50;p++)
-	{
-    #pragma omp parallel for
-		for (i = 0; i < dim; i++) {
+  // main execution
+  for (p = 0; p < 50; p++) {
+#pragma omp parallel for
+    for (i = 0; i < dim; i++) {
       int k;
-		  float sum = 0.0f;
-		  //int  bound = h_nzcnt[i / 32];
-		  int  bound = h_nzcnt[i];
-		  for(k=0;k<bound;k++ ) {
-			int j = h_ptr[k] + i;
-			int in = h_indices[j];
-
-			float d = h_data[j];
-			float t = h_x_vector[in];
-
-			sum += d*t;
-		  }
-    //  #pragma omp critical 
-		  h_Ax_vector[h_perm[i]] = sum;
-		}
-	}	
-
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Ax_vector,dim);
-		
-	}
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	free (h_data);
-	free (h_indices);
-	free (h_ptr);
-	free (h_perm);
-	free (h_nzcnt);
-	free (h_Ax_vector);
-	free (h_x_vector);
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
-
+      float sum = 0.0f;
+      // int  bound = h_nzcnt[i / 32];
+      int bound = h_nzcnt[i];
+      for (k = 0; k < bound; k++) {
+        int j = h_ptr[k] + i;
+        int in = h_indices[j];
+
+        float d = h_data[j];
+        float t = h_x_vector[in];
+
+        sum += d * t;
+      }
+      //  #pragma omp critical
+      h_Ax_vector[h_perm[i]] = sum;
+    }
+  }
+
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.c
index 6a9f404808baf25bf985524f0b90ad0eafc8cda0..4bc1b3f79a52e77a7c0524fedc9cd3c8c5137b7a 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.c
@@ -6,10 +6,10 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
@@ -17,27 +17,20 @@
  * Workgroup is multiple of 64 threads
  * Max threads 265
  */
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad)
-{
-	int max_thread=496*64;
-	int max_block=256;
-	int _grid;
-	int _thread;
-	
-	if(task*pad>max_thread)
-	{
-		_thread= max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad) {
+  int max_thread = 496 * 64;
+  int max_block = 256;
+  int _grid;
+  int _thread;
+
+  if (task * pad > max_thread) {
+    _thread = max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.h
index 4a061ca31c6af45d5940d9b221fb188408127367..fe1c5cb6c23e3ef6a08da51320d6c565fd28d5d7 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/gpu_info.h
@@ -9,9 +9,6 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/main.c
index 6b5d046583944217fe3a72767944f45a89e73e23..22c1b51753fd8591300d7fd5200dd346e0e8e058 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/main.c
@@ -8,272 +8,292 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
 #include <string.h>
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-static int generate_vector(float *x_vector, int dim) 
-{	
-	srand(54321);
-	int i;
-	//x_vector[0] = 1.0;
-	for(i=0;i<dim;i++)
-	{
-		x_vector[i] = (rand() / (float) RAND_MAX);
-		
-		//x_vector[i] = 1.0;
-	}
-	return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  // x_vector[0] = 1.0;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+
+    // x_vector[i] = 1.0;
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	printf("OpenCL base sparse matrix vector multiplication****\n");
-	printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-	printf("Optimized for ATI 5000 series by Ian Wetherbee <wetherb1@illinois.edu>\n");
-	parameters = pb_ReadParameters(&argc, argv);
-	if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    	{
-      		fprintf(stderr, "Expecting two input filenames\n");
-      		exit(-1);
-    	}
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	//parameters declaration
-	cl_int clStatus;
-	cl_platform_id clPlatform;
-	clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-	CHECK_ERROR("clGetPlatformIDs")
-
-	cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-	
-	cl_device_id clDevice;
-	clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-	CHECK_ERROR("clGetDeviceIDs")
-
-	cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
-
-	cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
-
-  	pb_SetOpenCL(&clContext, &clCommandQueue);
-	
-	const char* clSource[] = {readFile("src/opencl_ati/kernel.cl")};
-	cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
-
-	char clOptions[50];
-	sprintf(clOptions,"-Werror");
-	clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-	if (clStatus != CL_SUCCESS) {
-		size_t paramSize = 1024*1024, paramRet;
-		char* paramValue;
-		paramValue = (char*) calloc(paramSize, sizeof(char));
-		clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, paramSize, paramValue, &paramRet);
-		printf(paramValue);
-		return -1;
-	}
-	cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_vec",&clStatus);
-	CHECK_ERROR("clCreateKernel")
-
-	int len;
-	int depth;
-	int dim;
-	int pad=64;
-	int nzcnt_len;
-	
-	//host memory allocation
-	//matrix
-	float *h_data;
-	int *h_indices;
-	int *h_ptr;
-	int *h_perm;
-	int *h_nzcnt;
-	//vector
-	float *h_Ax_vector;
-	float *h_x_vector;
-	
-	//device memory allocation
-	//matrix
-	cl_mem d_data;
-	cl_mem d_indices;
-	cl_mem d_ptr;
-	cl_mem d_perm;
-	cl_mem d_nzcnt;
-
-	//vector
-	cl_mem d_Ax_vector;
-	cl_mem d_x_vector;
-	
-	cl_mem jds_ptr_int;
-	cl_mem sh_zcnt_int;
-
-	// HACK: remove the .bin from the end of data, remove later
-	//parameters->inpFiles[0][strlen(parameters->inpFiles[0])-4] = 0x00;
-	printf("Input file %s\n", parameters->inpFiles[1]);
-    	//load matrix from files
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	int col_count;
-	int warp_size=64;
-	coo_to_jds(
-		parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-		1, // row padding
-		warp_size, // warp size
-		1, // pack size
-		1, // is mirrored?
-		0, // binary matrix
-		1, // debug level [0:2]
-		&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-		&col_count, &dim, &len, &nzcnt_len, &depth
-	);
-	printf("Executing kernel...\n");
-	//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-	//    &h_data, &h_indices, &h_ptr,
-	//    &h_perm, &h_nzcnt);
-	
-  h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-  h_x_vector=(float*)malloc(sizeof(float)*dim);
-  input_vec( parameters->inpFiles[1],h_x_vector,dim);
-  //generate_vector(h_x_vector,dim) ;
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-
-	/*
-    	OpenCLDeviceProp clDeviceProp;
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-	CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	*/
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	//memory allocation
-	d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-	
-	//memory copy
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	size_t grid;
-	size_t block;
-
-	compute_active_thread(&block,&grid,nzcnt_len,pad);
-
-	clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-	CHECK_ERROR("clSetKernelArg")
-
-	clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-        CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,8,sizeof(int),&warp_size);
-        CHECK_ERROR("clSetKernelArg")
-	
-	//main execution
-	pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-	
-	int i;
-	for (i=0; i<50; i++)
-	{
-		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-		CHECK_ERROR("clEnqueueNDRangeKernel")
-	}
-
-	clStatus = clFinish(clCommandQueue);
-	CHECK_ERROR("clFinish")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	//HtoD memory copy
-	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")	
-
-	clStatus = clReleaseKernel(clKernel);
-	clStatus = clReleaseProgram(clProgram);
-
-	clStatus = clReleaseMemObject(d_data);
-	clStatus = clReleaseMemObject(d_indices);
-        clStatus = clReleaseMemObject(d_perm);
-	clStatus = clReleaseMemObject(d_nzcnt);
-        clStatus = clReleaseMemObject(d_x_vector);
-	clStatus = clReleaseMemObject(d_Ax_vector);
-	CHECK_ERROR("clReleaseMemObject")
-
-	clStatus = clReleaseCommandQueue(clCommandQueue);
-	clStatus = clReleaseContext(clContext);	
-	printf("Output has %d entries\n", dim);
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-                //int temp = ((dim + 31)/32)*32;
-		outputData(parameters->outFile,h_Ax_vector,dim);
-	}
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	free((void*)clSource[0]);
-	
-	free (h_data);
-	free (h_indices);
-	free (h_ptr);
-	free (h_perm);
-	free (h_nzcnt);
-	free (h_Ax_vector);
-	free (h_x_vector);
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL base sparse matrix vector multiplication****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and "
+         "Shengzhao Wu<wu14@illinois.edu>\n");
+  printf("Optimized for ATI 5000 series by Ian Wetherbee "
+         "<wetherb1@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting two input filenames\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // parameters declaration
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  const char *clSource[] = {readFile("src/opencl_ati/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "-Werror");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  if (clStatus != CL_SUCCESS) {
+    size_t paramSize = 1024 * 1024, paramRet;
+    char *paramValue;
+    paramValue = (char *)calloc(paramSize, sizeof(char));
+    clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, paramSize,
+                          paramValue, &paramRet);
+    printf(paramValue);
+    return -1;
+  }
+  cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_vec", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  int len;
+  int depth;
+  int dim;
+  int pad = 64;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_ptr;
+  cl_mem d_perm;
+  cl_mem d_nzcnt;
+
+  // vector
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  // HACK: remove the .bin from the end of data, remove later
+  // parameters->inpFiles[0][strlen(parameters->inpFiles[0])-4] = 0x00;
+  printf("Input file %s\n", parameters->inpFiles[1]);
+  // load matrix from files
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  int col_count;
+  int warp_size = 64;
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             warp_size,               // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+  printf("Executing kernel...\n");
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+  // generate_vector(h_x_vector,dim) ;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  /*
+  OpenCLDeviceProp clDeviceProp;
+  clStatus =
+  clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+  clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+  clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  */
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                               nzcnt_len * sizeof(int), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(int), &warp_size);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+
+  int i;
+  for (i = 0; i < 50; i++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
+                                      &block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+  }
+
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // HtoD memory copy
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(d_nzcnt);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  printf("Output has %d entries\n", dim);
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    // int temp = ((dim + 31)/32)*32;
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free((void *)clSource[0]);
+
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.c
index 26df3d399da7826c39274d647d51e7aa61adf33c..93e261881f47cba8c5286ac11bfe199c5b720c45 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.c
@@ -1,48 +1,45 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*size);
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * size);
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        fclose(fp);
-        return buffer;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.h
index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.c
index d59d91d0a4310d165aaf925ced2d768cd6a74a12..4bc1b3f79a52e77a7c0524fedc9cd3c8c5137b7a 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.c
@@ -6,10 +6,10 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
@@ -17,27 +17,20 @@
  * Workgroup is multiple of 64 threads
  * Max threads 265
  */
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad)
-{
-	int max_thread=496*64;
-	int max_block=256;
-	int _grid;
-	int _thread;
-	
-	if(task*pad>max_thread)
-	{
-		_thread=max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad) {
+  int max_thread = 496 * 64;
+  int max_block = 256;
+  int _grid;
+  int _thread;
+
+  if (task * pad > max_thread) {
+    _thread = max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.h
index 4a061ca31c6af45d5940d9b221fb188408127367..fe1c5cb6c23e3ef6a08da51320d6c565fd28d5d7 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/gpu_info.h
@@ -9,9 +9,6 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/main.c
index 960d0406ca7ad3ec0460cd74ce35f477d2aab78e..ff6dd4138e4f7ab2421dc25c15bf9453f8691e83 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/main.c
@@ -8,273 +8,292 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
 #include <string.h>
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-static int generate_vector(float *x_vector, int dim) 
-{	
-	srand(54321);
-	int i;
-	//x_vector[0] = 1.0;
-	for(i=0;i<dim;i++)
-	{
-		x_vector[i] = (rand() / (float) RAND_MAX);
-		
-		//x_vector[i] = 1.0;
-	}
-	return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  // x_vector[0] = 1.0;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+
+    // x_vector[i] = 1.0;
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	printf("OpenCL accelerated sparse matrix vector multiplication****\n");
-	printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-	printf("Optimized for ATI 5000 series by Ian Wetherbee <wetherb1@illinois.edu>\n");
-	parameters = pb_ReadParameters(&argc, argv);
-	if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    	{
-      		fprintf(stderr, "Expecting two input filenames\n");
-      		exit(-1);
-    	}
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	//parameters declaration
-	cl_int clStatus;
-	cl_platform_id clPlatform;
-	clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-	CHECK_ERROR("clGetPlatformIDs")
-
-	cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-	
-	cl_device_id clDevice;
-	clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-	CHECK_ERROR("clGetDeviceIDs")
-
-	cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
-
-	cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
-
-  	pb_SetOpenCL(&clContext, &clCommandQueue);
-	
-	const char* clSource[] = {readFile("src/opencl_ati_vec/kernel.cl")};
-	cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
-
-	char clOptions[50];
-	sprintf(clOptions,"-Werror");
-	clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-	if (clStatus != CL_SUCCESS) {
-		size_t paramSize = 1024*1024, paramRet;
-		char* paramValue;
-		paramValue = (char*) calloc(paramSize, sizeof(char));
-		clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, paramSize, paramValue, &paramRet);
-		printf(paramValue);
-		return -1;
-	}
-	cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_vec",&clStatus);
-	CHECK_ERROR("clCreateKernel")
-
-	int len;
-	int depth;
-	int dim;
-	int pad=64;
-	int nzcnt_len;
-	
-	//host memory allocation
-	//matrix
-	float *h_data;
-	int *h_indices;
-	int *h_ptr;
-	int *h_perm;
-	int *h_nzcnt;
-	//vector
-	float *h_Ax_vector;
-	float *h_x_vector;
-	
-	//device memory allocation
-	//matrix
-	cl_mem d_data;
-	cl_mem d_indices;
-	cl_mem d_ptr;
-	cl_mem d_perm;
-	cl_mem d_nzcnt;
-
-	//vector
-	cl_mem d_Ax_vector;
-	cl_mem d_x_vector;
-	
-	cl_mem jds_ptr_int;
-	cl_mem sh_zcnt_int;
-
-	// HACK: remove the .bin from the end of data, remove later
-	//parameters->inpFiles[0][strlen(parameters->inpFiles[0])-4] = 0x00;
-	printf("Input file %s\n", parameters->inpFiles[0]);
-    	//load matrix from files
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	int col_count;
-	int warp_size=64;
-	coo_to_jds(
-		parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-		1, // row padding
-		warp_size, // warp size, IMPORTANT: change in kernel as well
-		4, // pack size
-		1, // is mirrored?
-		0, // binary matrix
-		1, // debug level [0:2]
-		&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-		&col_count, &dim, &len, &nzcnt_len, &depth
-	);
-	printf("Executing kernel...\n");
-	//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-	//    &h_data, &h_indices, &h_ptr,
-	//    &h_perm, &h_nzcnt);
-	
-  h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-  h_x_vector=(float*)malloc(sizeof(float)*dim);
-  input_vec( parameters->inpFiles[1],h_x_vector,dim);
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-
-	/*
-    	OpenCLDeviceProp clDeviceProp;
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-	CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	*/
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	//memory allocation
-	d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-	
-	//memory copy
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	size_t grid;
-	size_t block;
-
-	compute_active_thread(&block,&grid,nzcnt_len,pad);
- 
-
-	clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-	CHECK_ERROR("clSetKernelArg")
-
-	clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-        CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,8,sizeof(int),&warp_size);
-        CHECK_ERROR("clSetKernelArg")
-	
-	//main execution
-	pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-	
-	int i;
-	for (i=0; i<50; i++)
-	{
-
-		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-		CHECK_ERROR("clEnqueueNDRangeKernel")
-	}
-
-	clStatus = clFinish(clCommandQueue);
-	CHECK_ERROR("clFinish")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	//HtoD memory copy
-	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")	
-
-	clStatus = clReleaseKernel(clKernel);
-	clStatus = clReleaseProgram(clProgram);
-
-	clStatus = clReleaseMemObject(d_data);
-	clStatus = clReleaseMemObject(d_indices);
-        clStatus = clReleaseMemObject(d_perm);
-	clStatus = clReleaseMemObject(d_nzcnt);
-        clStatus = clReleaseMemObject(d_x_vector);
-	clStatus = clReleaseMemObject(d_Ax_vector);
-	CHECK_ERROR("clReleaseMemObject")
-
-	clStatus = clReleaseCommandQueue(clCommandQueue);
-	clStatus = clReleaseContext(clContext);	
-	printf("Output has %d entries\n", dim);
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-                //int temp = ((dim + 31)/32)*32;
-		outputData(parameters->outFile,h_Ax_vector,dim);
-	}
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	free((void*)clSource[0]);
-	
-	free (h_data);
-	free (h_indices);
-	free (h_ptr);
-	free (h_perm);
-	free (h_nzcnt);
-	free (h_Ax_vector);
-	free (h_x_vector);
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated sparse matrix vector multiplication****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and "
+         "Shengzhao Wu<wu14@illinois.edu>\n");
+  printf("Optimized for ATI 5000 series by Ian Wetherbee "
+         "<wetherb1@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting two input filenames\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // parameters declaration
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  const char *clSource[] = {readFile("src/opencl_ati_vec/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "-Werror");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  if (clStatus != CL_SUCCESS) {
+    size_t paramSize = 1024 * 1024, paramRet;
+    char *paramValue;
+    paramValue = (char *)calloc(paramSize, sizeof(char));
+    clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, paramSize,
+                          paramValue, &paramRet);
+    printf(paramValue);
+    return -1;
+  }
+  cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_vec", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  int len;
+  int depth;
+  int dim;
+  int pad = 64;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_ptr;
+  cl_mem d_perm;
+  cl_mem d_nzcnt;
+
+  // vector
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  // HACK: remove the .bin from the end of data, remove later
+  // parameters->inpFiles[0][strlen(parameters->inpFiles[0])-4] = 0x00;
+  printf("Input file %s\n", parameters->inpFiles[0]);
+  // load matrix from files
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  int col_count;
+  int warp_size = 64;
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             warp_size, // warp size, IMPORTANT: change in kernel as well
+             4,         // pack size
+             1,         // is mirrored?
+             0,         // binary matrix
+             1,         // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+  printf("Executing kernel...\n");
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  /*
+  OpenCLDeviceProp clDeviceProp;
+  clStatus =
+  clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+  clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+  clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  */
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                               nzcnt_len * sizeof(int), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 8, sizeof(int), &warp_size);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+
+  int i;
+  for (i = 0; i < 50; i++) {
+
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
+                                      &block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+  }
+
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // HtoD memory copy
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(d_nzcnt);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  printf("Output has %d entries\n", dim);
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    // int temp = ((dim + 31)/32)*32;
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free((void *)clSource[0]);
+
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.c
index 26df3d399da7826c39274d647d51e7aa61adf33c..93e261881f47cba8c5286ac11bfe199c5b720c45 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.c
@@ -1,48 +1,45 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*size);
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * size);
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        fclose(fp);
-        return buffer;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.h
index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_ati_vec/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU  //keep using 2.0
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/main.c
index 964dab2864a63f675db9614eea856cab46f6d6cc..1bc75a9bd092f0b454eca1e52d691c9f99ba49cb 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/main.c
@@ -8,256 +8,275 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
-
-
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-static int generate_vector(float *x_vector, int dim) 
-{	
-	srand(54321);
-	int i;
-	for(i=0;i<dim;i++)
-	{
-		x_vector[i] = (rand() / (float) RAND_MAX);
-	}
-	return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	printf("CUDA accelerated sparse matrix vector multiplication****\n");
-	printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-	printf("This version maintained by Chris Rodrigues  ***********\n");
-	parameters = pb_ReadParameters(&argc, argv);
-	if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    	{
-      		fprintf(stderr, "Expecting one input filename\n");
-      		exit(-1);
-    	}
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	//parameters declaration
-	cl_int clStatus;
-	cl_platform_id clPlatform;
-	clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-	CHECK_ERROR("clGetPlatformIDs")
-
-	cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-	
-	cl_device_id clDevice;
-	clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-	CHECK_ERROR("clGetDeviceIDs")
-
-	cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
-
-	cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
-
-  	pb_SetOpenCL(&clContext, &clCommandQueue);
-	
-	const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-	cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
-
-	char clOptions[50];
-	sprintf(clOptions,"");
-	clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-	CHECK_ERROR("clBuildProgram")
-
-	cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);
-	CHECK_ERROR("clCreateKernel")
-
-	int len;
-	int depth;
-	int dim;
-	int pad=32;
-	int nzcnt_len;
-	
-	//host memory allocation
-	//matrix
-	float *h_data;
-	int *h_indices;
-	int *h_ptr;
-	int *h_perm;
-	int *h_nzcnt;
-	//vector
-	float *h_Ax_vector;
-	float *h_x_vector;
-	
-	//device memory allocation
-	//matrix
-	cl_mem d_data;
-	cl_mem d_indices;
-	cl_mem d_ptr;
-	cl_mem d_perm;
-	cl_mem d_nzcnt;
-
-	//vector
-	cl_mem d_Ax_vector;
-	cl_mem d_x_vector;
-	
-	cl_mem jds_ptr_int;
-	cl_mem sh_zcnt_int;
-
-    	//load matrix from files
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-	//    &h_data, &h_indices, &h_ptr,
-	//    &h_perm, &h_nzcnt);
-	int col_count;
-	coo_to_jds(
-		parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-		1, // row padding
-		pad, // warp size
-		1, // pack size
-		1, // is mirrored?
-		0, // binary matrix
-		1, // debug level [0:2]
-		&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-		&col_count, &dim, &len, &nzcnt_len, &depth
-	);
-	
-//	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	h_Ax_vector=(float*)malloc(sizeof(float)*dim);	
-	h_x_vector=(float*)malloc(sizeof(float)*dim);	
-	
-  input_vec( parameters->inpFiles[1],h_x_vector,dim);
-
-	 pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    	OpenCLDeviceProp clDeviceProp;
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-	//CHECK_ERROR("clGetDeviceInfo")
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("CUDA accelerated sparse matrix vector multiplication****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and "
+         "Shengzhao Wu<wu14@illinois.edu>\n");
+  printf("This version maintained by Chris Rodrigues  ***********\n");
+  parameters = pb_ReadParameters(&argc, argv);
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting one input filename\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // parameters declaration
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_naive", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_ptr;
+  cl_mem d_perm;
+  cl_mem d_nzcnt;
+
+  // vector
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  // load matrix from files
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  //	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  OpenCLDeviceProp clDeviceProp;
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
+  // CHECK_ERROR("clGetDeviceInfo")
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
   //      CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	//memory allocation
-	d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-	
-	//memory copy
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	size_t grid;
-	size_t block;
-
-	compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-//  printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is %d=\n",grid,block);
-//  printf("!!! dim is %d\n",dim);
-
-	clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-	CHECK_ERROR("clSetKernelArg")
-
-	clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-        CHECK_ERROR("clSetKernelArg")
-
-	//main execution
-	pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-	int i;
-	for (i=0; i<50; i++)
-	{
-		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-		CHECK_ERROR("clEnqueueNDRangeKernel")
-	}
-
-	clStatus = clFinish(clCommandQueue);
-	CHECK_ERROR("clFinish")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	//HtoD memory copy
-	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")	
-
-	clStatus = clReleaseKernel(clKernel);
-	clStatus = clReleaseProgram(clProgram);
-
-	clStatus = clReleaseMemObject(d_data);
-	clStatus = clReleaseMemObject(d_indices);
-        clStatus = clReleaseMemObject(d_perm);
-	clStatus = clReleaseMemObject(d_nzcnt);
-        clStatus = clReleaseMemObject(d_x_vector);
-	clStatus = clReleaseMemObject(d_Ax_vector);
-	CHECK_ERROR("clReleaseMemObject")
-
-	clStatus = clReleaseCommandQueue(clCommandQueue);
-	clStatus = clReleaseContext(clContext);	
-	
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Ax_vector,dim);
-	}
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	free((void*)clSource[0]);
-	
-	free (h_data);
-	free (h_indices);
-	free (h_ptr);
-	free (h_perm);
-	free (h_nzcnt);
-	free (h_Ax_vector);
-	free (h_x_vector);
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+  //  printf("!!! grid is %zu and block is %zu\n", grid, block);
+  //  printf("!!! dim is %d\n", dim);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+
+  int i;
+  for (i = 0; i < 50; i++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
+                                      &block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+  }
+
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // DtoH memory copy: read the result vector back to the host
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(d_nzcnt);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free((void *)clSource[0]);
+
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.h
index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_base/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU  //keep using 2.0
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c
index a471cb53938d231012a340633dfa3d3ae8845739..a19184a9659eaa91223da57e1b926ac6bff54b4e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/main.c
@@ -8,277 +8,294 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
-
-
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-static int generate_vector(float *x_vector, int dim)
-{
-    srand(54321);
-    int i;
-    for(i=0; i<dim; i++)
-    {
-        x_vector[i] = (rand() / (float) RAND_MAX);
-    }
-    return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("CUDA accelerated sparse matrix vector multiplication****\n");
-    printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-    printf("This version maintained by Chris Rodrigues  ***********\n");
-    parameters = pb_ReadParameters(&argc, argv);
-    if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    {
-        fprintf(stderr, "Expecting one input filename\n");
-        exit(-1);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("CUDA accelerated sparse matrix vector multiplication****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and "
+         "Shengzhao Wu<wu14@illinois.edu>\n");
+  printf("This version maintained by Chris Rodrigues  ***********\n");
+  parameters = pb_ReadParameters(&argc, argv);
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting one input filename\n");
+    exit(-1);
+  }
+
+  // load matrix from files
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  //	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  // parameters declaration
+  cl_int clStatus;
+
+  cl_uint numPlatforms;
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+
+  cl_device_id clDevice;
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue =
+      clCreateCommandQueue(clContext, clDevice, 0, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_ptr;
+  cl_mem d_perm;
+
+  // vector
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/
+  /*cl_program clProgram =
+   * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
+  /*CHECK_ERROR("clCreateProgramWithSource")*/
+
+  /*char clOptions[50];*/
+  /*sprintf(clOptions,"");*/
+  /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/
+  /*CHECK_ERROR("clBuildProgram")*/
+
+  /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/
+  /*CHECK_ERROR("clCreateKernel")*/
+
+  cl_kernel clKernel;
+  cl_program clProgram;
+  pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds_naive", &clContext,
+                                    &clDevice, &clProgram, &clKernel);
+
+  OpenCLDeviceProp clDeviceProp;
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
+  // CHECK_ERROR("clGetDeviceInfo")
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
+  //      CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                               nzcnt_len * sizeof(int), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+  //  printf("!!! grid is %zu and block is %zu\n", grid, block);
+  //  printf("!!! dim is %d\n", dim);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  int i;
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  for (int j = 0; j < 20; j++) {
+    for (i = 0; i < 50; i++) {
+      /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+      clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL,
+                                        &grid, &block, 0, NULL, NULL);
+      CHECK_ERROR("clEnqueueNDRangeKernel")
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
     }
-
-    //load matrix from files
+    clStatus = clFinish(clCommandQueue);
+    CHECK_ERROR("clFinish")
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // DtoH memory copy: read the result vector back to the host
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(sh_zcnt_int);
+  clStatus = clReleaseMemObject(jds_ptr_int);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  if (parameters->outFile) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-    //    &h_data, &h_indices, &h_ptr,
-    //    &h_perm, &h_nzcnt);
-    int col_count;
-    
-    int len;
-    int depth;
-    int dim;
-    int pad=32;
-    int nzcnt_len;
-
-    //host memory allocation
-    //matrix
-    float *h_data;
-    int *h_indices;
-    int *h_ptr;
-    int *h_perm;
-    int *h_nzcnt;
-    //vector
-    float *h_Ax_vector;
-    float *h_x_vector;
-
-    
-    coo_to_jds(
-        parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-        1, // row padding
-        pad, // warp size
-        1, // pack size
-        1, // is mirrored?
-        0, // binary matrix
-        1, // debug level [0:2]
-        &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-        &col_count, &dim, &len, &nzcnt_len, &depth
-    );
-
-//	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-    h_x_vector=(float*)malloc(sizeof(float)*dim);
-
-    input_vec( parameters->inpFiles[1],h_x_vector,dim);
-
-
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    //parameters declaration
-    cl_int clStatus;
-
-    cl_uint numPlatforms;
-    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_platform_id clPlatform[numPlatforms];
-    clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,0,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //device memory allocation
-    //matrix
-    cl_mem d_data;
-    cl_mem d_indices;
-    cl_mem d_ptr;
-    cl_mem d_perm;
-
-    //vector
-    cl_mem d_Ax_vector;
-    cl_mem d_x_vector;
-
-    cl_mem jds_ptr_int;
-    cl_mem sh_zcnt_int;
-
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/
-    /*cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
-    /*CHECK_ERROR("clCreateProgramWithSource")*/
-
-    /*char clOptions[50];*/
-    /*sprintf(clOptions,"");*/
-    /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/
-    /*CHECK_ERROR("clBuildProgram")*/
-
-    /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/
-    /*CHECK_ERROR("clCreateKernel")*/
-
-    cl_kernel clKernel;
-    cl_program clProgram;
-    pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds_naive", &clContext, &clDevice, &clProgram, &clKernel);
-
-    OpenCLDeviceProp clDeviceProp;
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-    //CHECK_ERROR("clGetDeviceInfo")
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-    //      CHECK_ERROR("clGetDeviceInfo")
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory allocation
-    d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    size_t grid;
-    size_t block;
-
-    compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-//  printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is %d=\n",grid,block);
-//  printf("!!! dim is %d\n",dim);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-    CHECK_ERROR("clSetKernelArg")
-
-    clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    int i;
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    for(int j=0; j<20; j++) {
-      for (i=0; i<50; i++)
-      {
-          /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-          clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-          CHECK_ERROR("clEnqueueNDRangeKernel")
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-      }
-      clStatus = clFinish(clCommandQueue);
-      CHECK_ERROR("clFinish")
-    }
-
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //HtoD memory copy
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-
-    clStatus = clReleaseMemObject(d_data);
-    clStatus = clReleaseMemObject(d_indices);
-    clStatus = clReleaseMemObject(d_perm);
-    clStatus = clReleaseMemObject(sh_zcnt_int);
-    clStatus = clReleaseMemObject(jds_ptr_int);
-    clStatus = clReleaseMemObject(d_x_vector);
-    clStatus = clReleaseMemObject(d_Ax_vector);
-    CHECK_ERROR("clReleaseMemObject")
-
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    pb_PrintTimerSet(&timers);
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Ax_vector,dim);
-    }
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    /*free((void*)clSource[0]);*/
+  /*free((void*)clSource[0]);*/
 
-    free (h_data);
-    free (h_indices);
-    free (h_ptr);
-    free (h_perm);
-    free (h_nzcnt);
-    free (h_Ax_vector);
-    free (h_x_vector);
-    pb_FreeParameters(parameters);
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.h
index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU: keep using the 2.0 limit
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c
index 1812bc12b7563259eb9797c00ac93a7a5e9210d9..d4fc026b73894e47c94dd7f2c9ef8f31e366eec6 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/main.c
@@ -8,271 +8,288 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
-
-
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-static int generate_vector(float *x_vector, int dim)
-{
-    srand(54321);
-    int i;
-    for(i=0; i<dim; i++)
-    {
-        x_vector[i] = (rand() / (float) RAND_MAX);
-    }
-    return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("CUDA accelerated sparse matrix vector multiplication****\n");
-    printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-    printf("This version maintained by Chris Rodrigues  ***********\n");
-    parameters = pb_ReadParameters(&argc, argv);
-    if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    {
-        fprintf(stderr, "Expecting one input filename\n");
-        exit(-1);
-    }
-
-    //load matrix from files
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("CUDA accelerated sparse matrix vector multiplication****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and "
+         "Shengzhao Wu<wu14@illinois.edu>\n");
+  printf("This version maintained by Chris Rodrigues  ***********\n");
+  parameters = pb_ReadParameters(&argc, argv);
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting one input filename\n");
+    exit(-1);
+  }
+
+  // load matrix from files
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  //	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // parameters declaration
+  cl_int clStatus;
+
+  cl_uint numPlatforms;
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+
+  cl_device_id clDevice;
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue =
+      clCreateCommandQueue(clContext, clDevice, 0, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_ptr;
+  cl_mem d_perm;
+
+  // vector
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  const char *clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_naive", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  /*cl_kernel clKernel;*/
+  /*cl_program clProgram;*/
+  /*pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext,
+   * &clDevice, &clProgram, &clKernel);*/
+
+  OpenCLDeviceProp clDeviceProp;
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
+  // CHECK_ERROR("clGetDeviceInfo")
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
+  //      CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                               nzcnt_len * sizeof(int), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+  //  printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is
+  //  %d=\n",grid,block); printf("!!! dim is %d\n",dim);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+
+  int i;
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  /*for(int j=0; j<20; j++) {*/
+  for (i = 0; i < 50; i++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
+                                      &block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+  }
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+  /*}*/
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // DtoH memory copy: read the result vector back to the host
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(sh_zcnt_int);
+  clStatus = clReleaseMemObject(jds_ptr_int);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  if (parameters->outFile) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-    //    &h_data, &h_indices, &h_ptr,
-    //    &h_perm, &h_nzcnt);
-    int col_count;
-    
-    int len;
-    int depth;
-    int dim;
-    int pad=32;
-    int nzcnt_len;
-
-    //host memory allocation
-    //matrix
-    float *h_data;
-    int *h_indices;
-    int *h_ptr;
-    int *h_perm;
-    int *h_nzcnt;
-    //vector
-    float *h_Ax_vector;
-    float *h_x_vector;
-
-    
-    coo_to_jds(
-        parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-        1, // row padding
-        pad, // warp size
-        1, // pack size
-        1, // is mirrored?
-        0, // binary matrix
-        1, // debug level [0:2]
-        &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-        &col_count, &dim, &len, &nzcnt_len, &depth
-    );
-
-//	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-    h_x_vector=(float*)malloc(sizeof(float)*dim);
-
-    input_vec( parameters->inpFiles[1],h_x_vector,dim);
-
-
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //parameters declaration
-    cl_int clStatus;
-
-    cl_uint numPlatforms;
-    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_platform_id clPlatform[numPlatforms];
-    clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,0,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    //device memory allocation
-    //matrix
-    cl_mem d_data;
-    cl_mem d_indices;
-    cl_mem d_ptr;
-    cl_mem d_perm;
-
-    //vector
-    cl_mem d_Ax_vector;
-    cl_mem d_x_vector;
-
-    cl_mem jds_ptr_int;
-    cl_mem sh_zcnt_int;
-
-
-    const char* clSource[] = {readFile("src/opencl_cpu_baseline/kernel.cl")};
-    cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-    CHECK_ERROR("clCreateProgramWithSource")
-
-    char clOptions[50];
-    sprintf(clOptions,"");
-    clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-    CHECK_ERROR("clBuildProgram")
-
-    cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);
-    CHECK_ERROR("clCreateKernel")
-
-    /*cl_kernel clKernel;*/
-    /*cl_program clProgram;*/
-    /*pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);*/
-
-    OpenCLDeviceProp clDeviceProp;
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-    //CHECK_ERROR("clGetDeviceInfo")
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-    //      CHECK_ERROR("clGetDeviceInfo")
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory allocation
-    d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    size_t grid;
-    size_t block;
-
-    compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-//  printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is %d=\n",grid,block);
-//  printf("!!! dim is %d\n",dim);
-
-    clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-    CHECK_ERROR("clSetKernelArg")
-
-    clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-
-    int i;
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    /*for(int j=0; j<20; j++) {*/
-      for (i=0; i<50; i++)
-      {
-          clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-          CHECK_ERROR("clEnqueueNDRangeKernel")
-      }
-      clStatus = clFinish(clCommandQueue);
-      CHECK_ERROR("clFinish")
-    /*}*/
-
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //HtoD memory copy
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-
-    clStatus = clReleaseMemObject(d_data);
-    clStatus = clReleaseMemObject(d_indices);
-    clStatus = clReleaseMemObject(d_perm);
-    clStatus = clReleaseMemObject(sh_zcnt_int);
-    clStatus = clReleaseMemObject(jds_ptr_int);
-    clStatus = clReleaseMemObject(d_x_vector);
-    clStatus = clReleaseMemObject(d_Ax_vector);
-    CHECK_ERROR("clReleaseMemObject")
-
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    pb_PrintTimerSet(&timers);
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Ax_vector,dim);
-    }
-
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-
-    /*free((void*)clSource[0]);*/
-
-    free (h_data);
-    free (h_indices);
-    free (h_ptr);
-    free (h_perm);
-    free (h_nzcnt);
-    free (h_Ax_vector);
-    free (h_x_vector);
-    pb_FreeParameters(parameters);
-
-    return 0;
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+
+  /*free((void*)clSource[0]);*/
+
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.h
index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_baseline/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU  //keep using 2.0
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c
index 7b26d543aab0b480975111ff2a06a8cbd103de34..42ffab597d028eacba7f9975473908bdf812524e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/main.c
@@ -8,277 +8,294 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
-
-
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-static int generate_vector(float *x_vector, int dim)
-{
-    srand(54321);
-    int i;
-    for(i=0; i<dim; i++)
-    {
-        x_vector[i] = (rand() / (float) RAND_MAX);
-    }
-    return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("CUDA accelerated sparse matrix vector multiplication****\n");
-    printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-    printf("This version maintained by Chris Rodrigues  ***********\n");
-    parameters = pb_ReadParameters(&argc, argv);
-    if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    {
-        fprintf(stderr, "Expecting one input filename\n");
-        exit(-1);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("CUDA accelerated sparse matrix vector multiplication****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and "
+         "Shengzhao Wu<wu14@illinois.edu>\n");
+  printf("This version maintained by Chris Rodrigues  ***********\n");
+  parameters = pb_ReadParameters(&argc, argv);
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting one input filename\n");
+    exit(-1);
+  }
+
+  // load matrix from files
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  //	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  // parameters declaration
+  cl_int clStatus;
+
+  cl_uint numPlatforms;
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+
+  cl_device_id clDevice;
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue =
+      clCreateCommandQueue(clContext, clDevice, 0, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_ptr;
+  cl_mem d_perm;
+
+  // vector
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/
+  /*cl_program clProgram =
+   * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
+  /*CHECK_ERROR("clCreateProgramWithSource")*/
+
+  /*char clOptions[50];*/
+  /*sprintf(clOptions,"");*/
+  /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/
+  /*CHECK_ERROR("clBuildProgram")*/
+
+  /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/
+  /*CHECK_ERROR("clCreateKernel")*/
+
+  cl_kernel clKernel;
+  cl_program clProgram;
+  pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext,
+                                    &clDevice, &clProgram, &clKernel);
+
+  OpenCLDeviceProp clDeviceProp;
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
+  // CHECK_ERROR("clGetDeviceInfo")
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
+  //      CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                               nzcnt_len * sizeof(int), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+  //  printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is
+  //  %d=\n",grid,block); printf("!!! dim is %d\n",dim);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  int i;
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  for (int j = 0; j < 1; j++) {
+    for (i = 0; i < 50; i++) {
+      /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+      clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL,
+                                        &grid, &block, 0, NULL, NULL);
+      CHECK_ERROR("clEnqueueNDRangeKernel")
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
     }
-
-    //load matrix from files
+    clStatus = clFinish(clCommandQueue);
+    CHECK_ERROR("clFinish")
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // HtoD memory copy
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(sh_zcnt_int);
+  clStatus = clReleaseMemObject(jds_ptr_int);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  if (parameters->outFile) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-    //    &h_data, &h_indices, &h_ptr,
-    //    &h_perm, &h_nzcnt);
-    int col_count;
-    
-    int len;
-    int depth;
-    int dim;
-    int pad=32;
-    int nzcnt_len;
-
-    //host memory allocation
-    //matrix
-    float *h_data;
-    int *h_indices;
-    int *h_ptr;
-    int *h_perm;
-    int *h_nzcnt;
-    //vector
-    float *h_Ax_vector;
-    float *h_x_vector;
-
-    
-    coo_to_jds(
-        parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-        1, // row padding
-        pad, // warp size
-        1, // pack size
-        1, // is mirrored?
-        0, // binary matrix
-        1, // debug level [0:2]
-        &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-        &col_count, &dim, &len, &nzcnt_len, &depth
-    );
-
-//	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-    h_x_vector=(float*)malloc(sizeof(float)*dim);
-
-    input_vec( parameters->inpFiles[1],h_x_vector,dim);
-
-
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    //parameters declaration
-    cl_int clStatus;
-
-    cl_uint numPlatforms;
-    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_platform_id clPlatform[numPlatforms];
-    clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,0,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //device memory allocation
-    //matrix
-    cl_mem d_data;
-    cl_mem d_indices;
-    cl_mem d_ptr;
-    cl_mem d_perm;
-
-    //vector
-    cl_mem d_Ax_vector;
-    cl_mem d_x_vector;
-
-    cl_mem jds_ptr_int;
-    cl_mem sh_zcnt_int;
-
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/
-    /*cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
-    /*CHECK_ERROR("clCreateProgramWithSource")*/
-
-    /*char clOptions[50];*/
-    /*sprintf(clOptions,"");*/
-    /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/
-    /*CHECK_ERROR("clBuildProgram")*/
-
-    /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/
-    /*CHECK_ERROR("clCreateKernel")*/
-
-    cl_kernel clKernel;
-    cl_program clProgram;
-    pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);
-
-    OpenCLDeviceProp clDeviceProp;
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-    //CHECK_ERROR("clGetDeviceInfo")
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-    //      CHECK_ERROR("clGetDeviceInfo")
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory allocation
-    d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    size_t grid;
-    size_t block;
-
-    compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-//  printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is %d=\n",grid,block);
-//  printf("!!! dim is %d\n",dim);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-    CHECK_ERROR("clSetKernelArg")
-
-    clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    int i;
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    for(int j=0; j<1; j++) {
-      for (i=0; i<50; i++)
-      {
-          /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-          clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-          CHECK_ERROR("clEnqueueNDRangeKernel")
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-      }
-      clStatus = clFinish(clCommandQueue);
-      CHECK_ERROR("clFinish")
-    }
-
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //HtoD memory copy
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-
-    clStatus = clReleaseMemObject(d_data);
-    clStatus = clReleaseMemObject(d_indices);
-    clStatus = clReleaseMemObject(d_perm);
-    clStatus = clReleaseMemObject(sh_zcnt_int);
-    clStatus = clReleaseMemObject(jds_ptr_int);
-    clStatus = clReleaseMemObject(d_x_vector);
-    clStatus = clReleaseMemObject(d_Ax_vector);
-    CHECK_ERROR("clReleaseMemObject")
-
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    pb_PrintTimerSet(&timers);
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Ax_vector,dim);
-    }
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    /*free((void*)clSource[0]);*/
+  /*free((void*)clSource[0]);*/
 
-    free (h_data);
-    free (h_indices);
-    free (h_ptr);
-    free (h_perm);
-    free (h_nzcnt);
-    free (h_Ax_vector);
-    free (h_x_vector);
-    pb_FreeParameters(parameters);
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.h
index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_huge/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU: keep using the 2.0 limit
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c
index 2f1bff5bb31a0ac0ab980998b681247e3c94d7f2..fbd272b32f7f60fbd0c651b0f329550b47e4db27 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/main.c
@@ -8,277 +8,294 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
-
-
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-static int generate_vector(float *x_vector, int dim)
-{
-    srand(54321);
-    int i;
-    for(i=0; i<dim; i++)
-    {
-        x_vector[i] = (rand() / (float) RAND_MAX);
-    }
-    return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("CUDA accelerated sparse matrix vector multiplication****\n");
-    printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-    printf("This version maintained by Chris Rodrigues  ***********\n");
-    parameters = pb_ReadParameters(&argc, argv);
-    if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    {
-        fprintf(stderr, "Expecting one input filename\n");
-        exit(-1);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("CUDA accelerated sparse matrix vector multiplication****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and "
+         "Shengzhao Wu<wu14@illinois.edu>\n");
+  printf("This version maintained by Chris Rodrigues  ***********\n");
+  parameters = pb_ReadParameters(&argc, argv);
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting one input filename\n");
+    exit(-1);
+  }
+
+  // load matrix from files
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  //	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  // parameters declaration
+  cl_int clStatus;
+
+  cl_uint numPlatforms;
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+
+  cl_device_id clDevice;
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue =
+      clCreateCommandQueue(clContext, clDevice, 0, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_ptr;
+  cl_mem d_perm;
+
+  // vector
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/
+  /*cl_program clProgram =
+   * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
+  /*CHECK_ERROR("clCreateProgramWithSource")*/
+
+  /*char clOptions[50];*/
+  /*sprintf(clOptions,"");*/
+  /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/
+  /*CHECK_ERROR("clBuildProgram")*/
+
+  /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/
+  /*CHECK_ERROR("clCreateKernel")*/
+
+  cl_kernel clKernel;
+  cl_program clProgram;
+  pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext,
+                                    &clDevice, &clProgram, &clKernel);
+
+  OpenCLDeviceProp clDeviceProp;
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
+  // CHECK_ERROR("clGetDeviceInfo")
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
+  //      CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                               nzcnt_len * sizeof(int), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+  //  printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is
+  //  %d=\n",grid,block); printf("!!! dim is %d\n",dim);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  int i;
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  for (int j = 0; j < 20; j++) {
+    for (i = 0; i < 50; i++) {
+      /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+      clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL,
+                                        &grid, &block, 0, NULL, NULL);
+      CHECK_ERROR("clEnqueueNDRangeKernel")
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
     }
-
-    //load matrix from files
+    clStatus = clFinish(clCommandQueue);
+    CHECK_ERROR("clFinish")
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // DtoH memory copy
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(sh_zcnt_int);
+  clStatus = clReleaseMemObject(jds_ptr_int);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  if (parameters->outFile) {
     /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-    //    &h_data, &h_indices, &h_ptr,
-    //    &h_perm, &h_nzcnt);
-    int col_count;
-    
-    int len;
-    int depth;
-    int dim;
-    int pad=32;
-    int nzcnt_len;
-
-    //host memory allocation
-    //matrix
-    float *h_data;
-    int *h_indices;
-    int *h_ptr;
-    int *h_perm;
-    int *h_nzcnt;
-    //vector
-    float *h_Ax_vector;
-    float *h_x_vector;
-
-    
-    coo_to_jds(
-        parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-        1, // row padding
-        pad, // warp size
-        1, // pack size
-        1, // is mirrored?
-        0, // binary matrix
-        1, // debug level [0:2]
-        &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-        &col_count, &dim, &len, &nzcnt_len, &depth
-    );
-
-//	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-    h_x_vector=(float*)malloc(sizeof(float)*dim);
-
-    input_vec( parameters->inpFiles[1],h_x_vector,dim);
-
-
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    //parameters declaration
-    cl_int clStatus;
-
-    cl_uint numPlatforms;
-    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_platform_id clPlatform[numPlatforms];
-    clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,0,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //device memory allocation
-    //matrix
-    cl_mem d_data;
-    cl_mem d_indices;
-    cl_mem d_ptr;
-    cl_mem d_perm;
-
-    //vector
-    cl_mem d_Ax_vector;
-    cl_mem d_x_vector;
-
-    cl_mem jds_ptr_int;
-    cl_mem sh_zcnt_int;
-
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    /*const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};*/
-    /*cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
-    /*CHECK_ERROR("clCreateProgramWithSource")*/
-
-    /*char clOptions[50];*/
-    /*sprintf(clOptions,"");*/
-    /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/
-    /*CHECK_ERROR("clBuildProgram")*/
-
-    /*cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);*/
-    /*CHECK_ERROR("clCreateKernel")*/
-
-    cl_kernel clKernel;
-    cl_program clProgram;
-    pb_CreateAndBuildKernelFromBinary("kernel.ir", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);
-
-    OpenCLDeviceProp clDeviceProp;
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-    //CHECK_ERROR("clGetDeviceInfo")
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-    //      CHECK_ERROR("clGetDeviceInfo")
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory allocation
-    d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    size_t grid;
-    size_t block;
-
-    compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-//  printf("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!grid is %d and block is %d=\n",grid,block);
-//  printf("!!! dim is %d\n",dim);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-    CHECK_ERROR("clSetKernelArg")
-
-    clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    int i;
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    for(int j=0; j<20; j++) {
-      for (i=0; i<50; i++)
-      {
-          /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-          clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-          CHECK_ERROR("clEnqueueNDRangeKernel")
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-      }
-      clStatus = clFinish(clCommandQueue);
-      CHECK_ERROR("clFinish")
-    }
-
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //HtoD memory copy
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-
-    clStatus = clReleaseMemObject(d_data);
-    clStatus = clReleaseMemObject(d_indices);
-    clStatus = clReleaseMemObject(d_perm);
-    clStatus = clReleaseMemObject(sh_zcnt_int);
-    clStatus = clReleaseMemObject(jds_ptr_int);
-    clStatus = clReleaseMemObject(d_x_vector);
-    clStatus = clReleaseMemObject(d_Ax_vector);
-    CHECK_ERROR("clReleaseMemObject")
-
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    pb_PrintTimerSet(&timers);
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Ax_vector,dim);
-    }
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
 
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    /*free((void*)clSource[0]);*/
+  /*free((void*)clSource[0]);*/
 
-    free (h_data);
-    free (h_indices);
-    free (h_ptr);
-    free (h_perm);
-    free (h_nzcnt);
-    free (h_Ax_vector);
-    free (h_x_vector);
-    pb_FreeParameters(parameters);
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.h
index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_cpu_large/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU  //keep using 2.0
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c
index c2a43eb0d171640ac33b7e711ab21990b5462af0..343814149aa74139930380c2178e2f447c64e806 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/main.c
@@ -8,262 +8,276 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-
-static int generate_vector(float *x_vector, int dim)
-{
-    srand(54321);
-    int i;
-    for(i=0; i<dim; i++)
-    {
-        x_vector[i] = (rand() / (float) RAND_MAX);
-    }
-    return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated sparse matrix vector multiplication****\n");
+  printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao "
+         "Wu<wu14@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting one two filenames\n");
+    exit(-1);
+  }
+
+  // load matrix from files
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+
+  // parameters declaration
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+  /*cl_kernel clKernel;*/
+  /*cl_program clProgram;*/
+  /*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s",
+   * "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);*/
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_perm;
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  OpenCLDeviceProp clDeviceProp;
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.major), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.minor), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                               nzcnt_len * sizeof(int), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  int i;
+  for (i = 0; i < 50; i++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
+                                      &block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+  }
+
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // HtoD memory copy
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(sh_zcnt_int);
+  clStatus = clReleaseMemObject(jds_ptr_int);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+
+  // free((void*)clSource[0]);
 
-    printf("OpenCL accelerated sparse matrix vector multiplication****\n");
-    printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
 
-    if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    {
-        fprintf(stderr, "Expecting one two filenames\n");
-        exit(-1);
-    }
+  pb_FreeParameters(parameters);
 
-    //load matrix from files
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-    //    &h_data, &h_indices, &h_ptr,
-    //    &h_perm, &h_nzcnt);
-    int col_count;
-
-    //parameters declaration
-    int len;
-    int depth;
-    int dim;
-    int pad=32;
-    int nzcnt_len;
-
-    //host memory allocation
-    //matrix
-    float *h_data;
-    int *h_indices;
-    int *h_ptr;
-    int *h_perm;
-    int *h_nzcnt;
-
-    //vector
-    float *h_Ax_vector;
-    float *h_x_vector;
-
-
-    coo_to_jds(
-        parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-        1, // row padding
-        pad, // warp size
-        1, // pack size
-        1, // is mirrored?
-        0, // binary matrix
-        1, // debug level [0:2]
-        &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-        &col_count, &dim, &len, &nzcnt_len, &depth
-    );
-
-
-    h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-    h_x_vector=(float*)malloc(sizeof(float)*dim);
-    input_vec( parameters->inpFiles[1],h_x_vector,dim);
-
-
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    cl_int clStatus;
-    cl_platform_id clPlatform;
-    clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-    cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-    CHECK_ERROR("clCreateProgramWithSource")
-
-    char clOptions[50];
-    sprintf(clOptions,"");
-    clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-    CHECK_ERROR("clBuildProgram")
-
-    cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus);
-    CHECK_ERROR("clCreateKernel")
-    /*cl_kernel clKernel;*/
-    /*cl_program clProgram;*/
-    /*pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_default/kernel_offline.nvptx.s", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);*/
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //device memory allocation
-    //matrix
-    cl_mem d_data;
-    cl_mem d_indices;
-    cl_mem d_perm;
-    cl_mem d_Ax_vector;
-    cl_mem d_x_vector;
-
-    cl_mem jds_ptr_int;
-    cl_mem sh_zcnt_int;
-
-
-    OpenCLDeviceProp clDeviceProp;
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    //memory allocation
-    d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    size_t grid;
-    size_t block;
-
-    compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-
-    clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-    CHECK_ERROR("clSetKernelArg")
-
-    clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    int i;
-    for(i=0; i<50; i++)
-    {
-        clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-        CHECK_ERROR("clEnqueueNDRangeKernel")
-    }
-
-    clStatus = clFinish(clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-
-    //HtoD memory copy
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-
-    clStatus = clReleaseMemObject(d_data);
-    clStatus = clReleaseMemObject(d_indices);
-    clStatus = clReleaseMemObject(d_perm);
-    clStatus = clReleaseMemObject(sh_zcnt_int);
-    clStatus = clReleaseMemObject(jds_ptr_int);
-    clStatus = clReleaseMemObject(d_x_vector);
-    clStatus = clReleaseMemObject(d_Ax_vector);
-
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    pb_PrintTimerSet(&timers);
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Ax_vector,dim);
-
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-
-    //free((void*)clSource[0]);
-
-    free (h_data);
-    free (h_indices);
-    free (h_ptr);
-    free (h_perm);
-    free (h_nzcnt);
-    free (h_Ax_vector);
-    free (h_x_vector);
-
-    pb_FreeParameters(parameters);
-
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h
index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU  //keep using 2.0
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c
index 7e1c15d72919e3ba2cee94ad1fd4254b3325f1a8..4600a3e6b8d580ad6fc3986d24a712ad592e25eb 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/main.c
@@ -8,279 +8,293 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-
-static int generate_vector(float *x_vector, int dim)
-{
-    srand(54321);
-    int i;
-    for(i=0; i<dim; i++)
-    {
-        x_vector[i] = (rand() / (float) RAND_MAX);
-    }
-    return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated sparse matrix vector multiplication****\n");
-    printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-    if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    {
-        fprintf(stderr, "Expecting one two filenames\n");
-        exit(-1);
-    }
-
-    //load matrix from files
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-    //    &h_data, &h_indices, &h_ptr,
-    //    &h_perm, &h_nzcnt);
-    int col_count;
-
-    //parameters declaration
-    int len;
-    int depth;
-    int dim;
-    int pad=32;
-    int nzcnt_len;
-
-    //host memory allocation
-    //matrix
-    float *h_data;
-    int *h_indices;
-    int *h_ptr;
-    int *h_perm;
-    int *h_nzcnt;
-
-    //vector
-    float *h_Ax_vector;
-    float *h_x_vector;
-
-
-    coo_to_jds(
-        parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-        1, // row padding
-        pad, // warp size
-        1, // pack size
-        1, // is mirrored?
-        0, // binary matrix
-        1, // debug level [0:2]
-        &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-        &col_count, &dim, &len, &nzcnt_len, &depth
-    );
-
-    h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-    h_x_vector=(float*)malloc(sizeof(float)*dim);
-    input_vec( parameters->inpFiles[1],h_x_vector,dim);
-
-    printf("Col count = %d, dim = %d\n", col_count, dim);
-
-
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    cl_int clStatus;
-    cl_platform_id clPlatform;
-    clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-    //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-    //CHECK_ERROR("clCreateProgramWithSource")
-
-    //char clOptions[50];
-    //sprintf(clOptions,"");
-    //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-    //CHECK_ERROR("clBuildProgram")
-
-    //cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus);
-    //CHECK_ERROR("clCreateKernel")
-    cl_kernel clKernel;
-    cl_program clProgram;
-    pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_huge_default/kernel_offline.nvptx.s", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //device memory allocation
-    //matrix
-    cl_mem d_data;
-    cl_mem d_indices;
-    cl_mem d_perm;
-    cl_mem d_Ax_vector;
-    cl_mem d_x_vector;
-
-    cl_mem jds_ptr_int;
-    cl_mem sh_zcnt_int;
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    OpenCLDeviceProp clDeviceProp;
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    printf("len = %d, dim = %d, depth = %d, nzcnt_len = %d\n", len, dim, depth, nzcnt_len);
-    long totalmem = 8*(len*sizeof(float)
-                +len*sizeof(int)
-                +dim*sizeof(int)
-                +2*dim*sizeof(float)
-                +depth*sizeof(int)
-                +nzcnt_len*sizeof(int));
-    printf("total mem = %f MB\n", totalmem/(1024.0*1024));
-    //memory allocation
-    d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    size_t grid;
-    size_t block;
-
-    compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-    CHECK_ERROR("clSetKernelArg")
-
-    clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    int i;
-    for (int j=0; j<5; j++) {
-      for(i=0; i<50; i++)
-      {
-          /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-          clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-          CHECK_ERROR("clEnqueueNDRangeKernel")
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-      }
-
-
-      clStatus = clFinish(clCommandQueue);
-      CHECK_ERROR("clFinish")
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated sparse matrix vector multiplication****\n");
+  printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao "
+         "Wu<wu14@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting one two filenames\n");
+    exit(-1);
+  }
+
+  // load matrix from files
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+
+  // parameters declaration
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  printf("Col count = %d, dim = %d\n", col_count, dim);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  // const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
+
+  // char clOptions[50];
+  // sprintf(clOptions,"");
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
+
+  // cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
+  cl_kernel clKernel;
+  cl_program clProgram;
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_nvidia_huge_default/kernel_offline.nvptx.s", "spmv_jds",
+      &clContext, &clDevice, &clProgram, &clKernel);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_perm;
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  OpenCLDeviceProp clDeviceProp;
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.major), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.minor), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  printf("len = %d, dim = %d, depth = %d, nzcnt_len = %d\n", len, dim, depth,
+         nzcnt_len);
+  long totalmem = 8 * (len * sizeof(float) + len * sizeof(int) +
+                       dim * sizeof(int) + 2 * dim * sizeof(float) +
+                       depth * sizeof(int) + nzcnt_len * sizeof(int));
+  printf("total mem = %f MB\n", totalmem / (1024.0 * 1024));
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                               nzcnt_len * sizeof(int), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  int i;
+  for (int j = 0; j < 5; j++) {
+    for (i = 0; i < 50; i++) {
+      /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+      clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL,
+                                        &grid, &block, 0, NULL, NULL);
+      CHECK_ERROR("clEnqueueNDRangeKernel")
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
     }
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+    clStatus = clFinish(clCommandQueue);
+    CHECK_ERROR("clFinish")
+  }
 
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
-    //HtoD memory copy
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
+  // HtoD memory copy
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
 
-    clStatus = clReleaseMemObject(d_data);
-    clStatus = clReleaseMemObject(d_indices);
-    clStatus = clReleaseMemObject(d_perm);
-    clStatus = clReleaseMemObject(sh_zcnt_int);
-    clStatus = clReleaseMemObject(jds_ptr_int);
-    clStatus = clReleaseMemObject(d_x_vector);
-    clStatus = clReleaseMemObject(d_Ax_vector);
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(sh_zcnt_int);
+  clStatus = clReleaseMemObject(jds_ptr_int);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
 
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    pb_PrintTimerSet(&timers);
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Ax_vector,dim);
-
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  pb_PrintTimerSet(&timers);
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
-    free (h_data);
-    free (h_indices);
-    free (h_ptr);
-    free (h_perm);
-    free (h_nzcnt);
-    free (h_Ax_vector);
-    free (h_x_vector);
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
 
-    pb_FreeParameters(parameters);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.h
index 9f082b2fb607495f9d527acbc6727134d3a8d353..9c4f12027d979367d53cc378450d100ebc51780f 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_huge/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s! ErrorCode = %d\n",errorMessage, clStatus);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s! ErrorCode = %d\n", errorMessage, clStatus);             \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU  //keep using 2.0
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c
index 5e43b4da93a08719a1bde55f426f5ac3f10cc49d..d2375af91dd8d4812fcb82b78b856e85feda376f 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/main.c
@@ -8,279 +8,293 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-
-static int generate_vector(float *x_vector, int dim)
-{
-    srand(54321);
-    int i;
-    for(i=0; i<dim; i++)
-    {
-        x_vector[i] = (rand() / (float) RAND_MAX);
-    }
-    return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated sparse matrix vector multiplication****\n");
-    printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-    if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    {
-        fprintf(stderr, "Expecting one two filenames\n");
-        exit(-1);
-    }
-
-    //load matrix from files
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-    //    &h_data, &h_indices, &h_ptr,
-    //    &h_perm, &h_nzcnt);
-    int col_count;
-
-    //parameters declaration
-    int len;
-    int depth;
-    int dim;
-    int pad=32;
-    int nzcnt_len;
-
-    //host memory allocation
-    //matrix
-    float *h_data;
-    int *h_indices;
-    int *h_ptr;
-    int *h_perm;
-    int *h_nzcnt;
-
-    //vector
-    float *h_Ax_vector;
-    float *h_x_vector;
-
-
-    coo_to_jds(
-        parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-        1, // row padding
-        pad, // warp size
-        1, // pack size
-        1, // is mirrored?
-        0, // binary matrix
-        1, // debug level [0:2]
-        &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-        &col_count, &dim, &len, &nzcnt_len, &depth
-    );
-
-    h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-    h_x_vector=(float*)malloc(sizeof(float)*dim);
-    input_vec( parameters->inpFiles[1],h_x_vector,dim);
-
-    printf("Col count = %d, dim = %d\n", col_count, dim);
-
-
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    cl_int clStatus;
-    cl_platform_id clPlatform;
-    clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-    CHECK_ERROR("clGetPlatformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-    //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-    //CHECK_ERROR("clCreateProgramWithSource")
-
-    //char clOptions[50];
-    //sprintf(clOptions,"");
-    //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-    //CHECK_ERROR("clBuildProgram")
-
-    //cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus);
-    //CHECK_ERROR("clCreateKernel")
-    cl_kernel clKernel;
-    cl_program clProgram;
-    pb_CreateAndBuildKernelFromBinary("build/opencl_nvidia_large_default/kernel_offline.nvptx.s", "spmv_jds", &clContext, &clDevice, &clProgram, &clKernel);
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    //device memory allocation
-    //matrix
-    cl_mem d_data;
-    cl_mem d_indices;
-    cl_mem d_perm;
-    cl_mem d_Ax_vector;
-    cl_mem d_x_vector;
-
-    cl_mem jds_ptr_int;
-    cl_mem sh_zcnt_int;
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    OpenCLDeviceProp clDeviceProp;
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-    clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-    CHECK_ERROR("clGetDeviceInfo")
-
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    printf("len = %d, dim = %d, depth = %d, nzcnt_len = %d\n", len, dim, depth, nzcnt_len);
-    long totalmem = 8*(len*sizeof(float)
-                +len*sizeof(int)
-                +dim*sizeof(int)
-                +2*dim*sizeof(float)
-                +depth*sizeof(int)
-                +nzcnt_len*sizeof(int));
-    printf("total mem = %f MB\n", totalmem/(1024.0*1024));
-    //memory allocation
-    d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,depth*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nzcnt_len*sizeof(int),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    size_t grid;
-    size_t block;
-
-    compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-    CHECK_ERROR("clSetKernelArg")
-
-    clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-    CHECK_ERROR("clSetKernelArg")
-    clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    int i;
-    for (int j=0; j<100; j++) {
-      for(i=0; i<50; i++)
-      {
-          /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-          clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-          CHECK_ERROR("clEnqueueNDRangeKernel")
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-      }
-
-
-      clStatus = clFinish(clCommandQueue);
-      CHECK_ERROR("clFinish")
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated sparse matrix vector multiplication****\n");
+  printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao "
+         "Wu<wu14@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting one two filenames\n");
+    exit(-1);
+  }
+
+  // load matrix from files
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+
+  // parameters declaration
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  printf("Col count = %d, dim = %d\n", col_count, dim);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  // const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
+
+  // char clOptions[50];
+  // sprintf(clOptions,"");
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
+
+  // cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
+  cl_kernel clKernel;
+  cl_program clProgram;
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_nvidia_large_default/kernel_offline.nvptx.s", "spmv_jds",
+      &clContext, &clDevice, &clProgram, &clKernel);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_perm;
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  OpenCLDeviceProp clDeviceProp;
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.major), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.minor), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  printf("len = %d, dim = %d, depth = %d, nzcnt_len = %d\n", len, dim, depth,
+         nzcnt_len);
+  long totalmem = 8 * (len * sizeof(float) + len * sizeof(int) +
+                       dim * sizeof(int) + 2 * dim * sizeof(float) +
+                       depth * sizeof(int) + nzcnt_len * sizeof(int));
+  printf("total mem = %f MB\n", totalmem / (1024.0 * 1024));
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, depth * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                               nzcnt_len * sizeof(int), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  int i;
+  for (int j = 0; j < 100; j++) {
+    for (i = 0; i < 50; i++) {
+      /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+      clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL,
+                                        &grid, &block, 0, NULL, NULL);
+      CHECK_ERROR("clEnqueueNDRangeKernel")
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
     }
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+    clStatus = clFinish(clCommandQueue);
+    CHECK_ERROR("clFinish")
+  }
 
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
-    //HtoD memory copy
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
+  // HtoD memory copy
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
 
-    clStatus = clReleaseMemObject(d_data);
-    clStatus = clReleaseMemObject(d_indices);
-    clStatus = clReleaseMemObject(d_perm);
-    clStatus = clReleaseMemObject(sh_zcnt_int);
-    clStatus = clReleaseMemObject(jds_ptr_int);
-    clStatus = clReleaseMemObject(d_x_vector);
-    clStatus = clReleaseMemObject(d_Ax_vector);
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(sh_zcnt_int);
+  clStatus = clReleaseMemObject(jds_ptr_int);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
 
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    pb_PrintTimerSet(&timers);
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Ax_vector,dim);
-
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  pb_PrintTimerSet(&timers);
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
-    free (h_data);
-    free (h_indices);
-    free (h_ptr);
-    free (h_perm);
-    free (h_nzcnt);
-    free (h_Ax_vector);
-    free (h_x_vector);
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
 
-    pb_FreeParameters(parameters);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.h
index 9f082b2fb607495f9d527acbc6727134d3a8d353..9c4f12027d979367d53cc378450d100ebc51780f 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_nvidia_large/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s! ErrorCode = %d\n",errorMessage, clStatus);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s! ErrorCode = %d\n", errorMessage, clStatus);             \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU  //keep using 2.0
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/main.c
index 1736545153135192f84735bb2888f148870df143..a18ed997526039c2292bb31255f8ac2fbe47915d 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/main.c
@@ -8,273 +8,292 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-static int generate_vector(float *x_vector, int dim) 
-{	
-	srand(54321);
-	int i;
-	for(i=0;i<dim;i++)
-	{
-		x_vector[i] = (rand() / (float) RAND_MAX);
-	}
-	return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	printf("OpenCL accelerated sparse matrix vector multiplication****\n");
-	printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-	parameters = pb_ReadParameters(&argc, argv);
-	
-	if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    	{
-      		fprintf(stderr, "Expecting two input filenames\n");
-      		exit(-1);
-    	}
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-	cl_int clStatus;
-	cl_platform_id clPlatform;
-	clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-	CHECK_ERROR("clGetPlatformIDs")
-
-	cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-	
-	cl_device_id clDevice;
-	clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-	CHECK_ERROR("clGetDeviceIDs")
-
-	cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
-
-	cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
-
-  	pb_SetOpenCL(&clContext, &clCommandQueue);
-
-	const char* clSource[] = {readFile("src/opencl_tex/kernel.cl")};
-	cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
-
-	char clOptions[50];
-	sprintf(clOptions,"");
-	clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-	CHECK_ERROR("clBuildProgram")
-	
-	cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_texture",&clStatus);
-	CHECK_ERROR("clCreateKernel")		
-
-	//parameters declaration
-	int len;
-	int depth;
-	int dim;
-	int pad=32;
-	int nzcnt_len;
-	
-	//host memory allocation
-	//matrix
-	float *h_data;
-	int *h_indices;
-	int *h_ptr;
-	int *h_perm;
-	int *h_nzcnt;
-
-	//vector
-	float *h_Ax_vector;
-    	float *h_x_vector;
-	
-	//device memory allocation
-	//matrix
-	cl_mem d_data;
-	cl_mem d_indices;
-	cl_mem d_perm;
-	cl_mem d_Ax_vector;
-	cl_mem d_x_vector;
-
-	cl_mem jds_ptr_int;
-	cl_mem sh_zcnt_int;
-	
-    	//load matrix from files
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-	//    &h_data, &h_indices, &h_ptr,
-	//    &h_perm, &h_nzcnt);
-	int col_count;
-	coo_to_jds(
-		parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-		1, // row padding
-		pad, // warp size
-		1, // pack size
-		1, // is mirrored?
-		0, // binary matrix
-		1, // debug level [0:2]
-		&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-		&col_count, &dim, &len, &nzcnt_len, &depth
-	);
-	
-
-	h_Ax_vector=(float*)malloc(sizeof(float)*dim);	
-	h_x_vector=(float*)malloc(sizeof(float)*dim);
-	input_vec( parameters->inpFiles[1],h_x_vector,dim);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated sparse matrix vector multiplication****\n");
+  printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao "
+         "Wu<wu14@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting two input filenames\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  const char *clSource[] = {readFile("src/opencl_tex/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_texture", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  // parameters declaration
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_perm;
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  // load matrix from files
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-    	OpenCLDeviceProp clDeviceProp;
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-//	CHECK_ERROR("clGetDeviceInfo")
-//	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
+
+  OpenCLDeviceProp clDeviceProp;
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
+  //	CHECK_ERROR("clGetDeviceInfo")
+  //	clStatus =
+  // clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
   //      CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_IMAGE2D_MAX_WIDTH,sizeof(size_t),&(clDeviceProp.maxImgWidth),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//memory allocation
-	d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	
-	cl_image_format clImgFmt = {CL_R,CL_FLOAT};
-
-	size_t clImgWidth;
-	size_t clImgHeight;
-	if(dim<=clDeviceProp.maxImgWidth)
-	{
-		clImgWidth = dim;
-		clImgHeight = 1;
-	}
-	else
-	{
-		clImgWidth = clDeviceProp.maxImgWidth;
-		clImgHeight = (dim+clDeviceProp.maxImgWidth-1)/clDeviceProp.maxImgWidth;
-	}
-		
-	d_x_vector = clCreateImage2D(clContext,CL_MEM_READ_ONLY,&clImgFmt,clImgWidth,clImgHeight,0,NULL,&clStatus);
-	CHECK_ERROR("clCreateImage2D")
-
-	d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-	
-	//memory copy
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	size_t clOrigin[3] = {0,0,0};
-	size_t clRegion[3] = {clImgWidth,clImgHeight,1};
-	size_t clRowPitch = clImgWidth*sizeof(cl_float);
-	size_t clSlicePitch = 0;
-	clStatus = clEnqueueWriteImage(clCommandQueue,d_x_vector,CL_FALSE,clOrigin,clRegion,clRowPitch,clSlicePitch,h_x_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteImage")
-
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	size_t grid;
-	size_t block;
-    	
-	compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-
-	clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-        CHECK_ERROR("clSetKernelArg")
-	
-	//main execution
-	pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-	
-	int i;
-	for (i=0; i<50; i++)
-	{
-		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-		CHECK_ERROR("clEnqueueNDRangeKernel")
-	}
-
-	clStatus = clFinish(clCommandQueue);
-	CHECK_ERROR("clFinish")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//HtoD memory copy
-	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")	
-
-	clStatus = clReleaseKernel(clKernel);
-	clStatus = clReleaseProgram(clProgram);
-
-	clStatus = clReleaseMemObject(d_data);
-	clStatus = clReleaseMemObject(d_indices);
-        clStatus = clReleaseMemObject(d_perm);
-        clStatus = clReleaseMemObject(d_x_vector);
-	clStatus = clReleaseMemObject(d_Ax_vector);
-	CHECK_ERROR("clReleaseMemObject")
-
-	clStatus = clReleaseCommandQueue(clCommandQueue);
-	clStatus = clReleaseContext(clContext);	
-	 
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Ax_vector,dim);
-		
-	}
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	free((void*)clSource[0]);
-
-	free (h_data);
-	free (h_indices);
-	free (h_ptr);
-	free (h_perm);
-	free (h_nzcnt);
-	free (h_Ax_vector);
-	free (h_x_vector);
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                             sizeof(size_t), &(clDeviceProp.maxImgWidth), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  cl_image_format clImgFmt = {CL_R, CL_FLOAT};
+
+  size_t clImgWidth;
+  size_t clImgHeight;
+  if (dim <= clDeviceProp.maxImgWidth) {
+    clImgWidth = dim;
+    clImgHeight = 1;
+  } else {
+    clImgWidth = clDeviceProp.maxImgWidth;
+    clImgHeight =
+        (dim + clDeviceProp.maxImgWidth - 1) / clDeviceProp.maxImgWidth;
+  }
+
+  d_x_vector = clCreateImage2D(clContext, CL_MEM_READ_ONLY, &clImgFmt,
+                               clImgWidth, clImgHeight, 0, NULL, &clStatus);
+  CHECK_ERROR("clCreateImage2D")
+
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  size_t clOrigin[3] = {0, 0, 0};
+  size_t clRegion[3] = {clImgWidth, clImgHeight, 1};
+  size_t clRowPitch = clImgWidth * sizeof(cl_float);
+  size_t clSlicePitch = 0;
+  clStatus = clEnqueueWriteImage(clCommandQueue, d_x_vector, CL_FALSE, clOrigin,
+                                 clRegion, clRowPitch, clSlicePitch, h_x_vector,
+                                 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteImage")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+
+  int i;
+  for (i = 0; i < 50; i++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
+                                      &block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+  }
+
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // DtoH memory copy: read the result vector back into host memory
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free((void *)clSource[0]);
+
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.h
index b34cb1b2494bd346c335eb9ce5b306d53ff9c8ae..bbdb88fbe9818887f0af522bb7456231603275db 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex/ocl.h
@@ -2,21 +2,20 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
-	size_t maxImgWidth;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
+  size_t maxImgWidth;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.c
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.c
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU: keep using the same limit as compute capability 2.0
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/main.c
index 4eebc0ad100e1a7e51c47268e62b134fc74c5d93..04559ee3ae2beaebf28067b346f5a93ddcbc282c 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/main.c
@@ -8,276 +8,295 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
-#include "convert_dataset.h"
 
-static int generate_vector(float *x_vector, int dim) 
-{	
-	srand(54321);
-	int i;
-	for(i=0;i<dim;i++)
-	{
-		x_vector[i] = (rand() / (float) RAND_MAX);
-	}
-	return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	printf("OpenCL accelerated sparse matrix vector multiplication****\n");
-	printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-	parameters = pb_ReadParameters(&argc, argv);
-	
-	if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    	{
-      		fprintf(stderr, "Expecting two input filenames\n");
-      		exit(-1);
-    	}
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-	cl_int clStatus;
-	cl_platform_id clPlatform;
-	clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-	CHECK_ERROR("clGetPlatformIDs")
-
-	cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-	
-	cl_device_id clDevice;
-	clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-	CHECK_ERROR("clGetDeviceIDs")
-
-	cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
-
-	cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
-
-  	pb_SetOpenCL(&clContext, &clCommandQueue);
-
-	const char* clSource[] = {readFile("src/opencl_tex_nvidia/kernel.cl")};
-	cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
-
-	char clOptions[50];
-	sprintf(clOptions,"");
-	clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-	CHECK_ERROR("clBuildProgram")
-	
-	cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_texture",&clStatus);
-	CHECK_ERROR("clCreateKernel")		
-
-	//parameters declaration
-	int len;
-	int depth;
-	int dim;
-	int pad=32;
-	int nzcnt_len;
-	
-	//host memory allocation
-	//matrix
-	float *h_data;
-	int *h_indices;
-	int *h_ptr;
-	int *h_perm;
-	int *h_nzcnt;
-
-	//vector
-	float *h_Ax_vector;
-    	float *h_x_vector;
-	
-	//device memory allocation
-	//matrix
-	cl_mem d_data;
-	cl_mem d_indices;
-	cl_mem d_perm;
-	cl_mem d_Ax_vector;
-	cl_mem d_x_vector;
-
-	cl_mem jds_ptr_int;
-	cl_mem sh_zcnt_int;
-	
-    	//load matrix from files
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	//inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-	//    &h_data, &h_indices, &h_ptr,
-	//    &h_perm, &h_nzcnt);
-	int col_count;
-	coo_to_jds(
-		parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-		1, // row padding
-		pad, // warp size
-		1, // pack size
-		1, // is mirrored?
-		0, // binary matrix
-		1, // debug level [0:2]
-		&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-		&col_count, &dim, &len, &nzcnt_len, &depth
-	);
-	
-	
-	h_Ax_vector=(float*)malloc(sizeof(float)*dim);	
-	h_x_vector=(float*)malloc(sizeof(float)*dim);
-	input_vec( parameters->inpFiles[1],h_x_vector,dim);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated sparse matrix vector multiplication****\n");
+  printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao "
+         "Wu<wu14@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting two input filenames\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
   pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-    	OpenCLDeviceProp clDeviceProp;
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-	CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_IMAGE2D_MAX_WIDTH,sizeof(size_t),&(clDeviceProp.maxImgWidth),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//memory allocation
-	d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	
-	cl_image_format clImgFmt = {CL_R,CL_FLOAT};
-
-	size_t clImgWidth;
-	size_t clImgHeight;
-	if(dim<=clDeviceProp.maxImgWidth)
-	{
-		clImgWidth = dim;
-		clImgHeight = 1;
-	}
-	else
-	{
-		clImgWidth = clDeviceProp.maxImgWidth;
-		clImgHeight = (dim+clDeviceProp.maxImgWidth-1)/clDeviceProp.maxImgWidth;
-	}
-		
-	d_x_vector = clCreateImage2D(clContext,CL_MEM_READ_ONLY,&clImgFmt,clImgWidth,clImgHeight,0,NULL,&clStatus);
-	CHECK_ERROR("clCreateImage2D")
-
-	d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-	
-	//memory copy
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	size_t clOrigin[3] = {0,0,0};
-	size_t clRegion[3] = {clImgWidth,clImgHeight,1};
-	size_t clRowPitch = clImgWidth*sizeof(cl_float);
-	size_t clSlicePitch = 0;
-	clStatus = clEnqueueWriteImage(clCommandQueue,d_x_vector,CL_FALSE,clOrigin,clRegion,clRowPitch,clSlicePitch,h_x_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteImage")
-
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	size_t grid;
-	size_t block;
-    	
-	compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-
-//	grid.x=nzcnt_len;
-//	block.x=pad;
-
-	clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-        CHECK_ERROR("clSetKernelArg")
-	
-	//main execution
-	pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-	
-	int i;
-	for (i=0; i<50; i++)
-	{
-		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-		CHECK_ERROR("clEnqueueNDRangeKernel")
-	}
-
-	clStatus = clFinish(clCommandQueue);
-	CHECK_ERROR("clFinish")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//HtoD memory copy
-	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")	
-
-	clStatus = clReleaseKernel(clKernel);
-	clStatus = clReleaseProgram(clProgram);
-
-	clStatus = clReleaseMemObject(d_data);
-	clStatus = clReleaseMemObject(d_indices);
-        clStatus = clReleaseMemObject(d_perm);
-        clStatus = clReleaseMemObject(d_x_vector);
-	clStatus = clReleaseMemObject(d_Ax_vector);
-	CHECK_ERROR("clReleaseMemObject")
-
-	clStatus = clReleaseCommandQueue(clCommandQueue);
-	clStatus = clReleaseContext(clContext);	
-	 
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Ax_vector,dim);
-		
-	}
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	free((void*)clSource[0]);
-
-	free (h_data);
-	free (h_indices);
-	free (h_ptr);
-	free (h_perm);
-	free (h_nzcnt);
-	free (h_Ax_vector);
-	free (h_x_vector);
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  const char *clSource[] = {readFile("src/opencl_tex_nvidia/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_texture", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  // parameters declaration
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_perm;
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  // load matrix from files
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  OpenCLDeviceProp clDeviceProp;
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.major), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.minor), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                             sizeof(size_t), &(clDeviceProp.maxImgWidth), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  cl_image_format clImgFmt = {CL_R, CL_FLOAT};
+
+  size_t clImgWidth;
+  size_t clImgHeight;
+  if (dim <= clDeviceProp.maxImgWidth) {
+    clImgWidth = dim;
+    clImgHeight = 1;
+  } else {
+    clImgWidth = clDeviceProp.maxImgWidth;
+    clImgHeight =
+        (dim + clDeviceProp.maxImgWidth - 1) / clDeviceProp.maxImgWidth;
+  }
+
+  d_x_vector = clCreateImage2D(clContext, CL_MEM_READ_ONLY, &clImgFmt,
+                               clImgWidth, clImgHeight, 0, NULL, &clStatus);
+  CHECK_ERROR("clCreateImage2D")
+
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  size_t clOrigin[3] = {0, 0, 0};
+  size_t clRegion[3] = {clImgWidth, clImgHeight, 1};
+  size_t clRowPitch = clImgWidth * sizeof(cl_float);
+  size_t clSlicePitch = 0;
+  clStatus = clEnqueueWriteImage(clCommandQueue, d_x_vector, CL_FALSE, clOrigin,
+                                 clRegion, clRowPitch, clSlicePitch, h_x_vector,
+                                 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteImage")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+
+  //	grid.x=nzcnt_len;
+  //	block.x=pad;
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+
+  int i;
+  for (i = 0; i < 50; i++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
+                                      &block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+  }
+
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // DtoH memory copy: read the result vector back into host memory
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free((void *)clSource[0]);
+
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.c
index 7e48e7e65426d1ba4b2edf89f6f3ae1fe33c12c7..2990031255acae7fe480b0fe7cdc79db7cb08287 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.c
@@ -1,49 +1,46 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.h
index b34cb1b2494bd346c335eb9ce5b306d53ff9c8ae..bbdb88fbe9818887f0af522bb7456231603275db 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/ocl.h
@@ -2,21 +2,20 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
-	size_t maxImgWidth;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
+  size_t maxImgWidth;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.cc
index 58f9a3a358fcad5da79b375f5d0ec45854317bc8..a15137259e9963e43bfaa56ddeda89399e2d38d6 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.cc
@@ -7,66 +7,61 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C" 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+extern "C" void inputData(char *fName, int *len, int *depth, int *dim,
+                          int *nzcnt_len, int *pad, float **h_data,
+                          int **h_indices, int **h_ptr, int **h_perm,
+                          int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
+
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
+
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
+
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
 
-  fclose (fid); 
+  fclose(fid);
 }
 
-extern "C" 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.h
index 64ef34a091ae149773ab9c5e8dfa352fbaa8d11e..e86d2ef8b66a60ae4bf5b1171ae23411dcf332d9 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/file.h
@@ -13,12 +13,11 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim, 
-               int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU: keep using the 2.0 limit
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/main.c
index 2b5c4b3a875ecd25954845381daf1060d03fe6b1..96f4ecc2997b33000e91abecfee8c85664cb4a12 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/main.c
@@ -8,237 +8,259 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
 
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
 
-static int generate_vector(float *x_vector, int dim) 
-{	
-	srand(54321);
-	int i;
-	for(i=0;i<dim;i++)
-	{
-		x_vector[i] = (rand() / (float) RAND_MAX);
-	}
-	return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	printf("CUDA accelerated sparse matrix vector multiplication****\n");
-	printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-	printf("This version maintained by Chris Rodrigues  ***********\n");
-	parameters = pb_ReadParameters(&argc, argv);
-	if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL))
-    	{
-      		fprintf(stderr, "Expecting one input filename\n");
-      		exit(-1);
-    	}
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	//parameters declaration
-	cl_int clStatus;
-	cl_platform_id clPlatform;
-	clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-	CHECK_ERROR("clGetPlatformIDs")
-
-	cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-	
-	cl_device_id clDevice;
-	clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-	CHECK_ERROR("clGetDeviceIDs")
-
-	cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
-
-	cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
-
-  	pb_SetOpenCL(&clContext, &clCommandQueue);
-	
-	const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-	cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
-
-	char clOptions[50];
-	sprintf(clOptions,"");
-	clStatus = clBuildProgram(clProgram,0,NULL,clOptions,NULL,NULL);
-	CHECK_ERROR("clBuildProgram")
-
-	cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_naive",&clStatus);
-	CHECK_ERROR("clCreateKernel")
-
-	int len;
-	int depth;
-	int dim;
-	int pad=32;
-	int nzcnt_len;
-	
-	//host memory allocation
-	//matrix
-	float *h_data;
-	int *h_indices;
-	int *h_ptr;
-	int *h_perm;
-	int *h_nzcnt;
-	//vector
-	float *h_Ax_vector;
-	float *h_x_vector;
-	
-	//device memory allocation
-	//matrix
-	cl_mem d_data;
-	cl_mem d_indices;
-	cl_mem d_ptr;
-	cl_mem d_perm;
-	cl_mem d_nzcnt;
-
-	//vector
-	cl_mem d_Ax_vector;
-	cl_mem d_x_vector;
-	
-	cl_mem jds_ptr_int;
-	cl_mem sh_zcnt_int;
-
-    	//load matrix from files
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-	    &h_data, &h_indices, &h_ptr,
-	    &h_perm, &h_nzcnt);
-		
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	h_Ax_vector=(float*)malloc(sizeof(float)*dim);	
-	h_x_vector=(float*)malloc(sizeof(float)*dim);	
-	generate_vector(h_x_vector, dim);
-	
-    	OpenCLDeviceProp clDeviceProp;
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-	CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	//memory allocation
-	d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-	
-	//memory copy
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	size_t grid;
-	size_t block;
-
-	compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-
-	clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-	CHECK_ERROR("clSetKernelArg")
-
-	clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-        CHECK_ERROR("clSetKernelArg")
-
-	//main execution
-	pb_SwitchToTimer(&timers, pb_TimerID_GPU);
-
-	int i;
-	for (i=0; i<50; i++)
-	{
-		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-		CHECK_ERROR("clEnqueueNDRangeKernel")
-	}
-
-	clStatus = clFinish(clCommandQueue);
-	CHECK_ERROR("clFinish")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	//HtoD memory copy
-	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")	
-
-	clStatus = clReleaseKernel(clKernel);
-	clStatus = clReleaseProgram(clProgram);
-
-	clStatus = clReleaseMemObject(d_data);
-	clStatus = clReleaseMemObject(d_indices);
-        clStatus = clReleaseMemObject(d_perm);
-	clStatus = clReleaseMemObject(d_nzcnt);
-        clStatus = clReleaseMemObject(d_x_vector);
-	clStatus = clReleaseMemObject(d_Ax_vector);
-	CHECK_ERROR("clReleaseMemObject")
-
-	clStatus = clReleaseCommandQueue(clCommandQueue);
-	clStatus = clReleaseContext(clContext);	
-	
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Ax_vector,dim);
-	}
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	free((void*)clSource[0]);
-	
-	free (h_data);
-	free (h_indices);
-	free (h_ptr);
-	free (h_perm);
-	free (h_nzcnt);
-	free (h_Ax_vector);
-	free (h_x_vector);
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("CUDA accelerated sparse matrix vector multiplication****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and "
+         "Shengzhao Wu<wu14@illinois.edu>\n");
+  printf("This version maintained by Chris Rodrigues  ***********\n");
+  parameters = pb_ReadParameters(&argc, argv);
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL)) {
+    fprintf(stderr, "Expecting one input filename\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // parameters declaration
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "");
+  clStatus = clBuildProgram(clProgram, 0, NULL, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_naive", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_ptr;
+  cl_mem d_perm;
+  cl_mem d_nzcnt;
+
+  // vector
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  // load matrix from files
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  inputData(parameters->inpFiles[0], &len, &depth, &dim, &nzcnt_len, &pad,
+            &h_data, &h_indices, &h_ptr, &h_perm, &h_nzcnt);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  generate_vector(h_x_vector, dim);
+
+  OpenCLDeviceProp clDeviceProp;
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.major), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.minor), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_GPU);
+
+  int i;
+  for (i = 0; i < 50; i++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
+                                      &block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+  }
+
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // DtoH memory copy
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(d_nzcnt);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free((void *)clSource[0]);
+
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.c
index 26df3d399da7826c39274d647d51e7aa61adf33c..93e261881f47cba8c5286ac11bfe199c5b720c45 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.c
@@ -1,48 +1,45 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*size);
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * size);
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        fclose(fp);
-        return buffer;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.h
index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_base/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.cc
index 58f9a3a358fcad5da79b375f5d0ec45854317bc8..a15137259e9963e43bfaa56ddeda89399e2d38d6 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.cc
@@ -7,66 +7,61 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C" 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+extern "C" void inputData(char *fName, int *len, int *depth, int *dim,
+                          int *nzcnt_len, int *pad, float **h_data,
+                          int **h_indices, int **h_ptr, int **h_perm,
+                          int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
+
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
+
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
+
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
 
-  fclose (fid); 
+  fclose(fid);
 }
 
-extern "C" 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.h
index 64ef34a091ae149773ab9c5e8dfa352fbaa8d11e..e86d2ef8b66a60ae4bf5b1171ae23411dcf332d9 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/file.h
@@ -13,12 +13,11 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim, 
-               int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPUs: keep using the compute capability 2.0 limit
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/main.c
index 483281d18b32a85ceb49ee1ca34d5811a2eb7b80..218c658394428a0cb96fb702f3a08d7cede040f1 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/main.c
@@ -8,238 +8,259 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
 
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
 
-static int generate_vector(float *x_vector, int dim) 
-{	
-	srand(54321);
-	int i;
-	for(i=0;i<dim;i++)
-	{
-		x_vector[i] = (rand() / (float) RAND_MAX);
-	}
-	return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	printf("OpenCL accelerated sparse matrix vector multiplication****\n");
-	printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-	parameters = pb_ReadParameters(&argc, argv);
-	
-	if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL))
-    	{
-      		fprintf(stderr, "Expecting one input filename\n");
-      		exit(-1);
-    	}
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-	cl_int clStatus;
-	cl_platform_id clPlatform;
-	clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-	CHECK_ERROR("clGetPlatformIDs")
-
-	cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-	
-	cl_device_id clDevice;
-	clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-	CHECK_ERROR("clGetDeviceIDs")
-
-	cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
-
-	cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
-
-  	pb_SetOpenCL(&clContext, &clCommandQueue);
-
-	const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-	cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
-
-	char clOptions[50];
-	sprintf(clOptions,"");
-	clStatus = clBuildProgram(clProgram,0,NULL,clOptions,NULL,NULL);
-	CHECK_ERROR("clBuildProgram")
-
-	cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds",&clStatus);
-	CHECK_ERROR("clCreateKernel")		
-
-	//parameters declaration
-	int len;
-	int depth;
-	int dim;
-	int pad=32;
-	int nzcnt_len;
-	
-	//host memory allocation
-	//matrix
-	float *h_data;
-	int *h_indices;
-	int *h_ptr;
-	int *h_perm;
-	int *h_nzcnt;
-
-	//vector
-	float *h_Ax_vector;
-    	float *h_x_vector;
-	
-	//device memory allocation
-	//matrix
-	cl_mem d_data;
-	cl_mem d_indices;
-	cl_mem d_perm;
-	cl_mem d_nzcnt;
-	cl_mem d_Ax_vector;
-	cl_mem d_x_vector;
-
-	cl_mem jds_ptr_int;
-	cl_mem sh_zcnt_int;
-	
-    	//load matrix from files
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-	    &h_data, &h_indices, &h_ptr,
-	    &h_perm, &h_nzcnt);
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	h_Ax_vector=(float*)malloc(sizeof(float)*dim);	
-	h_x_vector=(float*)malloc(sizeof(float)*dim);
-	generate_vector(h_x_vector, dim);
-	
-    	OpenCLDeviceProp clDeviceProp;
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-	CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//memory allocation
-	d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_x_vector = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-	
-	//memory copy
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_vector,CL_FALSE,0,dim*sizeof(int),h_x_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	size_t grid;
-	size_t block;
-    	
-	compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-
-	clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-	CHECK_ERROR("clSetKernelArg")
-
-	clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-        CHECK_ERROR("clSetKernelArg")
-
-	//main execution
-	pb_SwitchToTimer(&timers, pb_TimerID_GPU);
-
-	int i;
-	for(i=0; i<50; i++)
-	{
-		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-		CHECK_ERROR("clEnqueueNDRangeKernel")
-	}
-
-	clStatus = clFinish(clCommandQueue);
-	CHECK_ERROR("clFinish")
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//HtoD memory copy
-	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")	
-
-	clStatus = clReleaseKernel(clKernel);
-	clStatus = clReleaseProgram(clProgram);
-
-	clStatus = clReleaseMemObject(d_data);
-	clStatus = clReleaseMemObject(d_indices);
-        clStatus = clReleaseMemObject(d_perm);
-	clStatus = clReleaseMemObject(d_nzcnt);
-        clStatus = clReleaseMemObject(d_x_vector);
-	clStatus = clReleaseMemObject(d_Ax_vector);
-	CHECK_ERROR("clReleaseMemObject")
-
-	clStatus = clReleaseCommandQueue(clCommandQueue);
-	clStatus = clReleaseContext(clContext);	
-	 
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Ax_vector,dim);
-		
-	}
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	free((void*)clSource[0]);
-
-	free (h_data);
-	free (h_indices);
-	free (h_ptr);
-	free (h_perm);
-	free (h_nzcnt);
-	free (h_Ax_vector);
-	free (h_x_vector);
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated sparse matrix vector multiplication****\n");
+  printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao "
+         "Wu<wu14@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL)) {
+    fprintf(stderr, "Expecting one input filename\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "");
+  clStatus = clBuildProgram(clProgram, 0, NULL, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  // matrix size/layout parameters, filled in by inputData()
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host buffers
+  // matrix arrays (allocated inside inputData(), freed at the end of main)
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+
+  // input (x) and result (Ax) vectors
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  // device buffer handles (created with clCreateBuffer below)
+  // matrix and vectors
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_perm;
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  // load matrix from files
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  inputData(parameters->inpFiles[0], &len, &depth, &dim, &nzcnt_len, &pad,
+            &h_data, &h_indices, &h_ptr, &h_perm, &h_nzcnt);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  generate_vector(h_x_vector, dim);
+
+  OpenCLDeviceProp clDeviceProp;
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.major), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.minor), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // device buffer creation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_x_vector = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(float),
+                              NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // host-to-device copies; only the last write is blocking (CL_TRUE)
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_vector, CL_FALSE, 0,
+                                  dim * sizeof(int), h_x_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution: the kernel is enqueued 50 times, then the queue drained
+  pb_SwitchToTimer(&timers, pb_TimerID_GPU);
+
+  int i;
+  for (i = 0; i < 50; i++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
+                                      &block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+  }
+
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // DtoH memory copy: blocking read of the result vector
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(jds_ptr_int);
+  clStatus = clReleaseMemObject(sh_zcnt_int);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free((void *)clSource[0]);
+
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.c
index 26df3d399da7826c39274d647d51e7aa61adf33c..93e261881f47cba8c5286ac11bfe199c5b720c45 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.c
@@ -1,48 +1,45 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*size);
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)calloc(size + 1, sizeof(char)); // keeps trailing NUL
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        fclose(fp);
-        return buffer;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.h
index 8840a8682b6e383d82bc886f448a601780b81e22..42ff7b4d1059550293b56325d0cce2afea6c004b 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_nvidia/ocl.h
@@ -2,20 +2,19 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.cc b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.cc
index 58f9a3a358fcad5da79b375f5d0ec45854317bc8..a15137259e9963e43bfaa56ddeda89399e2d38d6 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.cc
@@ -7,66 +7,61 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C" 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+extern "C" void inputData(char *fName, int *len, int *depth, int *dim,
+                          int *nzcnt_len, int *pad, float **h_data,
+                          int **h_indices, int **h_ptr, int **h_perm,
+                          int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
+
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
+
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
+
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
 
-  fclose (fid); 
+  fclose(fid);
 }
 
-extern "C" 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.h
index 64ef34a091ae149773ab9c5e8dfa352fbaa8d11e..e86d2ef8b66a60ae4bf5b1171ae23411dcf332d9 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/file.h
@@ -13,12 +13,11 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim, 
-               int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.c
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.c
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU: keep using the compute capability 2.0 limit
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/main.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/main.c
index 34938ab1f214896fa4d4d17f99e9625bd08acf0e..f65916d62a083facf41eca5ad7a38eb62c6ea8e9 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/main.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/main.c
@@ -8,259 +8,280 @@
 
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <parboil.h>
 
 #include "file.h"
 #include "gpu_info.h"
 #include "ocl.h"
 
-static int generate_vector(float *x_vector, int dim) 
-{	
-	srand(54321);
-	int i;
-	for(i=0;i<dim;i++)
-	{
-		x_vector[i] = (rand() / (float) RAND_MAX);
-	}
-	return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	printf("OpenCL accelerated sparse matrix vector multiplication****\n");
-	printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-	parameters = pb_ReadParameters(&argc, argv);
-	
-	if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL))
-    	{
-      		fprintf(stderr, "Expecting one input filename\n");
-      		exit(-1);
-    	}
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-	cl_int clStatus;
-	cl_platform_id clPlatform;
-	clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-	CHECK_ERROR("clGetPlatformIDs")
-
-	cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-	
-	cl_device_id clDevice;
-	clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-	CHECK_ERROR("clGetDeviceIDs")
-
-	cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
-
-	cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
-
-  	pb_SetOpenCL(&clContext, &clCommandQueue);
-
-	const char* clSource[] = {readFile("src/opencl_tex/kernel.cl")};
-	cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
-
-	char clOptions[50];
-	sprintf(clOptions,"");
-	clStatus = clBuildProgram(clProgram,0,NULL,clOptions,NULL,NULL);
-	CHECK_ERROR("clBuildProgram")
-	
-	cl_kernel clKernel = clCreateKernel(clProgram,"spmv_jds_texture",&clStatus);
-	CHECK_ERROR("clCreateKernel")		
-
-	//parameters declaration
-	int len;
-	int depth;
-	int dim;
-	int pad=32;
-	int nzcnt_len;
-	
-	//host memory allocation
-	//matrix
-	float *h_data;
-	int *h_indices;
-	int *h_ptr;
-	int *h_perm;
-	int *h_nzcnt;
-
-	//vector
-	float *h_Ax_vector;
-    	float *h_x_vector;
-	
-	//device memory allocation
-	//matrix
-	cl_mem d_data;
-	cl_mem d_indices;
-	cl_mem d_perm;
-	cl_mem d_Ax_vector;
-	cl_mem d_x_vector;
-
-	cl_mem jds_ptr_int;
-	cl_mem sh_zcnt_int;
-	
-    	//load matrix from files
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-	    &h_data, &h_indices, &h_ptr,
-	    &h_perm, &h_nzcnt);
-		
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	h_Ax_vector=(float*)malloc(sizeof(float)*dim);	
-	h_x_vector=(float*)malloc(sizeof(float)*dim);
-	generate_vector(h_x_vector, dim);
-	
-    	OpenCLDeviceProp clDeviceProp;
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,sizeof(cl_uint),&(clDeviceProp.major),NULL);
-	CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,sizeof(cl_uint),&(clDeviceProp.minor),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_MAX_COMPUTE_UNITS,sizeof(cl_uint),&(clDeviceProp.multiProcessorCount),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	clStatus = clGetDeviceInfo(clDevice,CL_DEVICE_IMAGE2D_MAX_WIDTH,sizeof(size_t),&(clDeviceProp.maxImgWidth),NULL);
-        CHECK_ERROR("clGetDeviceInfo")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//memory allocation
-	d_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_indices = clCreateBuffer(clContext,CL_MEM_READ_ONLY,len*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_perm = clCreateBuffer(clContext,CL_MEM_READ_ONLY,dim*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	
-	cl_image_format clImgFmt = {CL_R,CL_FLOAT};
-
-	size_t clImgWidth;
-	size_t clImgHeight;
-	if(dim<=clDeviceProp.maxImgWidth)
-	{
-		clImgWidth = dim;
-		clImgHeight = 1;
-	}
-	else
-	{
-		clImgWidth = clDeviceProp.maxImgWidth;
-		clImgHeight = (dim+clDeviceProp.maxImgWidth-1)/clDeviceProp.maxImgWidth;
-	}
-		
-	d_x_vector = clCreateImage2D(clContext,CL_MEM_READ_ONLY,&clImgFmt,clImgWidth,clImgHeight,0,NULL,&clStatus);
-	CHECK_ERROR("clCreateImage2D")
-
-	d_Ax_vector = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,dim*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	jds_ptr_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	sh_zcnt_int = clCreateBuffer(clContext,CL_MEM_READ_ONLY,5000*sizeof(int),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-
-	clMemSet(clCommandQueue,d_Ax_vector,0,dim*sizeof(float));
-	
-	//memory copy
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_data,CL_FALSE,0,len*sizeof(float),h_data,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_indices,CL_FALSE,0,len*sizeof(int),h_indices,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_perm,CL_FALSE,0,dim*sizeof(int),h_perm,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	size_t clOrigin[3] = {0,0,0};
-	size_t clRegion[3] = {clImgWidth,clImgHeight,1};
-	size_t clRowPitch = clImgWidth*sizeof(cl_float);
-	size_t clSlicePitch = 0;
-	clStatus = clEnqueueWriteImage(clCommandQueue,d_x_vector,CL_FALSE,clOrigin,clRegion,clRowPitch,clSlicePitch,h_x_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteImage")
-
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,jds_ptr_int,CL_FALSE,0,depth*sizeof(int),h_ptr,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,sh_zcnt_int,CL_TRUE,0,nzcnt_len*sizeof(int),h_nzcnt,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	size_t grid;
-	size_t block;
-    	
-	compute_active_thread(&block,&grid,nzcnt_len,pad,clDeviceProp.major,clDeviceProp.minor,clDeviceProp.multiProcessorCount);
-
-	clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&d_Ax_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_data);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_indices);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),&d_perm);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&d_x_vector);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,5,sizeof(int),&dim);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,6,sizeof(cl_mem),&jds_ptr_int);
-	CHECK_ERROR("clSetKernelArg")
-	clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&sh_zcnt_int);
-        CHECK_ERROR("clSetKernelArg")
-	
-	//main execution
-	pb_SwitchToTimer(&timers, pb_TimerID_GPU);
-	
-	int i;
-	for (i=0; i<50; i++)
-	{
-		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&grid,&block,0,NULL,NULL);
-		CHECK_ERROR("clEnqueueNDRangeKernel")
-	}
-
-	clStatus = clFinish(clCommandQueue);
-	CHECK_ERROR("clFinish")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//HtoD memory copy
-	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Ax_vector,CL_TRUE,0,dim*sizeof(float),h_Ax_vector,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")	
-
-	clStatus = clReleaseKernel(clKernel);
-	clStatus = clReleaseProgram(clProgram);
-
-	clStatus = clReleaseMemObject(d_data);
-	clStatus = clReleaseMemObject(d_indices);
-        clStatus = clReleaseMemObject(d_perm);
-        clStatus = clReleaseMemObject(d_x_vector);
-	clStatus = clReleaseMemObject(d_Ax_vector);
-	CHECK_ERROR("clReleaseMemObject")
-
-	clStatus = clReleaseCommandQueue(clCommandQueue);
-	clStatus = clReleaseContext(clContext);	
-	 
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Ax_vector,dim);
-		
-	}
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	free((void*)clSource[0]);
-
-	free (h_data);
-	free (h_indices);
-	free (h_ptr);
-	free (h_perm);
-	free (h_nzcnt);
-	free (h_Ax_vector);
-	free (h_x_vector);
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated sparse matrix vector multiplication****\n");
+  printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao "
+         "Wu<wu14@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] != NULL)) {
+    fprintf(stderr, "Expecting one input filename\n");
+    exit(-1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlatformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  const char *clSource[] = {readFile("src/opencl_tex/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "");
+  clStatus = clBuildProgram(clProgram, 0, NULL, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "spmv_jds_texture", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  // parameters declaration
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  // device memory allocation
+  // matrix
+  cl_mem d_data;
+  cl_mem d_indices;
+  cl_mem d_perm;
+  cl_mem d_Ax_vector;
+  cl_mem d_x_vector;
+
+  cl_mem jds_ptr_int;
+  cl_mem sh_zcnt_int;
+
+  // load matrix from files
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  inputData(parameters->inpFiles[0], &len, &depth, &dim, &nzcnt_len, &pad,
+            &h_data, &h_indices, &h_ptr, &h_perm, &h_nzcnt);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  generate_vector(h_x_vector, dim);
+
+  OpenCLDeviceProp clDeviceProp;
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.major), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
+                             sizeof(cl_uint), &(clDeviceProp.minor), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus =
+      clGetDeviceInfo(clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint),
+                      &(clDeviceProp.multiProcessorCount), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+  clStatus = clGetDeviceInfo(clDevice, CL_DEVICE_IMAGE2D_MAX_WIDTH,
+                             sizeof(size_t), &(clDeviceProp.maxImgWidth), NULL);
+  CHECK_ERROR("clGetDeviceInfo")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // memory allocation
+  d_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(float),
+                          NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_indices = clCreateBuffer(clContext, CL_MEM_READ_ONLY, len * sizeof(int),
+                             NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_perm = clCreateBuffer(clContext, CL_MEM_READ_ONLY, dim * sizeof(int), NULL,
+                          &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  cl_image_format clImgFmt = {CL_R, CL_FLOAT};
+
+  size_t clImgWidth;
+  size_t clImgHeight;
+  if (dim <= clDeviceProp.maxImgWidth) {
+    clImgWidth = dim;
+    clImgHeight = 1;
+  } else {
+    clImgWidth = clDeviceProp.maxImgWidth;
+    clImgHeight =
+        (dim + clDeviceProp.maxImgWidth - 1) / clDeviceProp.maxImgWidth;
+  }
+
+  d_x_vector = clCreateImage2D(clContext, CL_MEM_READ_ONLY, &clImgFmt,
+                               clImgWidth, clImgHeight, 0, NULL, &clStatus);
+  CHECK_ERROR("clCreateImage2D")
+
+  d_Ax_vector = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                               dim * sizeof(float), NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  jds_ptr_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  sh_zcnt_int = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 5000 * sizeof(int),
+                               NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  clMemSet(clCommandQueue, d_Ax_vector, 0, dim * sizeof(float));
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_data, CL_FALSE, 0,
+                                  len * sizeof(float), h_data, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_indices, CL_FALSE, 0,
+                                  len * sizeof(int), h_indices, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_perm, CL_FALSE, 0,
+                                  dim * sizeof(int), h_perm, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  size_t clOrigin[3] = {0, 0, 0};
+  size_t clRegion[3] = {clImgWidth, clImgHeight, 1};
+  size_t clRowPitch = clImgWidth * sizeof(cl_float);
+  size_t clSlicePitch = 0;
+  clStatus = clEnqueueWriteImage(clCommandQueue, d_x_vector, CL_FALSE, clOrigin,
+                                 clRegion, clRowPitch, clSlicePitch, h_x_vector,
+                                 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteImage")
+
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, jds_ptr_int, CL_FALSE, 0,
+                                  depth * sizeof(int), h_ptr, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, sh_zcnt_int, CL_TRUE, 0,
+                           nzcnt_len * sizeof(int), h_nzcnt, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
+                        clDeviceProp.minor, clDeviceProp.multiProcessorCount);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_data);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_indices);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), &d_perm);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &d_x_vector);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), &dim);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(cl_mem), &jds_ptr_int);
+  CHECK_ERROR("clSetKernelArg")
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(cl_mem), &sh_zcnt_int);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_GPU);
+
+  int i;
+  for (i = 0; i < 50; i++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &grid,
+                                      &block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+  }
+
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // DtoH memory copy
+  clStatus =
+      clEnqueueReadBuffer(clCommandQueue, d_Ax_vector, CL_TRUE, 0,
+                          dim * sizeof(float), h_Ax_vector, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+
+  clStatus = clReleaseMemObject(d_data);
+  clStatus = clReleaseMemObject(d_indices);
+  clStatus = clReleaseMemObject(d_perm);
+  clStatus = clReleaseMemObject(d_x_vector);
+  clStatus = clReleaseMemObject(d_Ax_vector);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free((void *)clSource[0]);
+
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.c b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.c
index 26df3d399da7826c39274d647d51e7aa61adf33c..93e261881f47cba8c5286ac11bfe199c5b720c45 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.c
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.c
@@ -1,48 +1,45 @@
+#include "ocl.h"
 #include <CL/cl.h>
 #include <stdio.h>
 #include <string.h>
-#include "ocl.h"
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*size);
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * size);
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        fclose(fp);
-        return buffer;
+  fclose(fp);
+  return buffer;
 }
 
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val,
+              size_t size) {
+  cl_int clStatus;
+  char *temp = (char *)malloc(size);
+  memset(temp, val, size);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, buf, CL_TRUE, 0, size, temp,
+                                  0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  free(temp);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.h b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.h
index b34cb1b2494bd346c335eb9ce5b306d53ff9c8ae..bbdb88fbe9818887f0af522bb7456231603275db 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/opencl_tex_nvidia/opencl_tex/ocl.h
@@ -2,21 +2,20 @@
 #define __OCLH__
 
 typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
-	size_t maxImgWidth;
+  cl_uint major;
+  cl_uint minor;
+  cl_uint multiProcessorCount;
+  size_t maxImgWidth;
 } OpenCLDeviceProp;
 
 void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp b/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp
index 43e99f3e049704495160883afc20923bd8d61cd3..22397498f7f43b3f60926bf51c2ddbff91529787 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp
+++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/file.cpp
@@ -7,72 +7,66 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt)
-{
-  FILE* fid = fopen(fName, "rb");
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt) {
+  FILE *fid = fopen(fName, "rb");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
+
+  fscanf(fid, "%d %d %d %d %d\n", len, depth, nzcnt_len, dim, pad);
+  int _len = len[0];
+  int _depth = depth[0];
+  int _dim = dim[0];
+  int _pad = pad[0];
+  int _nzcnt_len = nzcnt_len[0];
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
+  *h_data = (float *)malloc(_len * sizeof(float));
+  fread(*h_data, sizeof(float), _len, fid);
 
-  fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad);
-  int _len=len[0];
-  int _depth=depth[0];
-  int _dim=dim[0];
-  int _pad=pad[0];
-  int _nzcnt_len=nzcnt_len[0];
-  
-  *h_data = (float *) malloc(_len * sizeof (float));
-  fread (*h_data, sizeof (float), _len, fid);
-  
-  *h_indices = (int *) malloc(_len * sizeof (int));
-  fread (*h_indices, sizeof (int), _len, fid);
-  
-  *h_ptr = (int *) malloc(_depth * sizeof (int));
-  fread (*h_ptr, sizeof (int), _depth, fid);
-  
-  *h_perm = (int *) malloc(_dim * sizeof (int));
-  fread (*h_perm, sizeof (int), _dim, fid);
-  
-  *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int));
-  fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid);
+  *h_indices = (int *)malloc(_len * sizeof(int));
+  fread(*h_indices, sizeof(int), _len, fid);
 
-  fclose (fid); 
+  *h_ptr = (int *)malloc(_depth * sizeof(int));
+  fread(*h_ptr, sizeof(int), _depth, fid);
+
+  *h_perm = (int *)malloc(_dim * sizeof(int));
+  fread(*h_perm, sizeof(int), _dim, fid);
+
+  *h_nzcnt = (int *)malloc(_nzcnt_len * sizeof(int));
+  fread(*h_nzcnt, sizeof(int), _nzcnt_len, fid);
+
+  fclose(fid);
 }
 
-void input_vec(char *fName,float *h_vec,int dim)
-{
-  FILE* fid = fopen(fName, "rb");
-  fread (h_vec, sizeof (float), dim, fid);
+void input_vec(char *fName, float *h_vec, int dim) {
+  FILE *fid = fopen(fName, "rb");
+  fread(h_vec, sizeof(float), dim, fid);
   fclose(fid);
-  
 }
 
-void outputData(char* fName, float *h_Ax_vector,int dim)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_Ax_vector, int dim) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
   tmp32 = dim;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_Ax_vector, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h b/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h
index 65f80135f8059d570d19366ecf1c6372a12dff62..5e38a6875e9e5f8be4d01b68569d80adf8c49548 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/file.h
@@ -8,11 +8,11 @@
 #ifndef __FILEH__
 #define __FILEH__
 
-void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad,
-               float** h_data, int** h_indices, int** h_ptr,
-               int** h_perm, int** h_nzcnt);
+void inputData(char *fName, int *len, int *depth, int *dim, int *nzcnt_len,
+               int *pad, float **h_data, int **h_indices, int **h_ptr,
+               int **h_perm, int **h_nzcnt);
 
-void input_vec(char* fNanme, float *h_vec,int dim);
-void outputData(char* fName, float *h_Ax_vector,int dim);
+void input_vec(char *fNanme, float *h_vec, int dim);
+void outputData(char *fName, float *h_Ax_vector, int dim);
 
 #endif
\ No newline at end of file
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp b/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp
index c0cc9b29aa4260548469735c8a341f2f53ad1efc..90beedd747480ede3fd1e5da4017ed0051e043be 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp
+++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.cpp
@@ -6,50 +6,39 @@
  *cr
  ***************************************************************************/
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #include "gpu_info.h"
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm) {
+  int max_thread;
+  int max_block = 8;
+  if (major == 1) {
+    if (minor >= 2)
+      max_thread = 1024;
+    else
+      max_thread = 768;
+  } else if (major == 2)
+    max_thread = 1536;
+  else
+    // newer GPU: keep using the compute capability 2.0 limit
+    max_thread = 1536;
+
+  int _grid;
+  int _thread;
+
+  if (task * pad > sm * max_thread) {
+    _thread = max_thread / max_block;
+    _grid = ((task * pad + _thread - 1) / _thread) * _thread;
+  } else {
+    _thread = pad;
+    _grid = task * pad;
+  }
 
-	thread[0]=_thread;
-	grid[0]=_grid;
+  thread[0] = _thread;
+  grid[0] = _grid;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h b/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h
index 4219cda933c58332203b27e0ee3f8133bca2c0e8..ab1af7d0b8ba92f87c643582171e48cee0a9b95e 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h
+++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/gpu_info.h
@@ -9,12 +9,7 @@
 #ifndef __GPUINFOH__
 #define __GPUINFOH__
 
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
+void compute_active_thread(size_t *thread, size_t *grid, int task, int pad,
+                           int major, int minor, int sm);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp
index eb4d4d7dbed65c13340578f8023afab1473fe962..f6ce5ccfb2412036f4eadcdab419ceca0a6c8f30 100644
--- a/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp
+++ b/hpvm/test/parboil/benchmarks/spmv/src/visc/main.cpp
@@ -8,366 +8,354 @@
 
 //#include <CL/cl.h>
 //#include <CL/cl_ext.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 #include <visc.h>
 
+#include "convert_dataset.h"
 #include "file.h"
 #include "gpu_info.h"
-#include "convert_dataset.h"
 
 #define WARP_BITS 5
 
-static int generate_vector(float *x_vector, int dim)
-{
-    srand(54321);
-    int i;
-    for(i=0; i<dim; i++)
-    {
-        x_vector[i] = (rand() / (float) RAND_MAX);
-    }
-    return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
 typedef struct __attribute__((__packed__)) {
-    float* dst_vector; size_t bytes_dst_vector;
-    float* d_data; size_t bytes_d_data;
-    int* d_index; size_t bytes_d_index;
-    int* d_perm; size_t bytes_d_perm;
-    float* x_vec; size_t bytes_x_vec;
-    int dim;
-    int* jds_ptr_int; size_t bytes_jds_ptr_int;
-    int* sh_zcnt_int; size_t bytes_sh_zcnt_int;
-    size_t dim_X1, dim_X2;
+  float *dst_vector;
+  size_t bytes_dst_vector;
+  float *d_data;
+  size_t bytes_d_data;
+  int *d_index;
+  size_t bytes_d_index;
+  int *d_perm;
+  size_t bytes_d_perm;
+  float *x_vec;
+  size_t bytes_x_vec;
+  int dim;
+  int *jds_ptr_int;
+  size_t bytes_jds_ptr_int;
+  int *sh_zcnt_int;
+  size_t bytes_sh_zcnt_int;
+  size_t dim_X1, dim_X2;
 } RootIn;
 
-void spmv_jds(float* dst_vector, size_t bytes_dst_vector,
-              float* d_data, size_t bytes_d_data,
-              int* d_index, size_t bytes_d_index,
-              int* d_perm, size_t bytes_d_perm,
-              float* x_vec, size_t bytes_x_vec,
-              int dim,
-              int* jds_ptr_int, size_t bytes_jds_ptr_int,
-              int* sh_zcnt_int, size_t bytes_sh_zcnt_int)
-{
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int,
-                       1, dst_vector);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gridx = __visc__getNumNodeInstances_x(thisNode);
-
-    int ix = gx * gridx + lx;
-    int warp_id=ix>>WARP_BITS;
-
-    if(ix<dim)
+void spmv_jds(float *dst_vector, size_t bytes_dst_vector, float *d_data,
+              size_t bytes_d_data, int *d_index, size_t bytes_d_index,
+              int *d_perm, size_t bytes_d_perm, float *x_vec,
+              size_t bytes_x_vec, int dim, int *jds_ptr_int,
+              size_t bytes_jds_ptr_int, int *sh_zcnt_int,
+              size_t bytes_sh_zcnt_int) {
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+                     sh_zcnt_int, 1, dst_vector);
+
+  void *thisNode = __visc__getNode();
+  void *parentNode = __visc__getParentNode(thisNode);
+  int lx = __visc__getNodeInstanceID_x(thisNode);
+  int gx = __visc__getNodeInstanceID_x(parentNode);
+  int gridx = __visc__getNumNodeInstances_x(thisNode);
+
+  int ix = gx * gridx + lx;
+  int warp_id = ix >> WARP_BITS;
+
+  if (ix < dim) {
+    float sum = 0.0f;
+    int bound = sh_zcnt_int[warp_id];
+    // prefetch 0
+    int j = jds_ptr_int[0] + ix;
+    float d = d_data[j];
+    int i = d_index[j];
+    float t = x_vec[i];
+
+    if (bound > 1) // bound >=2
     {
-        float sum=0.0f;
-        int bound=sh_zcnt_int[warp_id];
-        //prefetch 0
-        int j=jds_ptr_int[0]+ix;
-        float d = d_data[j];
-        int i = d_index[j];
-        float t = x_vec[i];
-
-        if (bound>1)  //bound >=2
-        {
-            //prefetch 1
-            j=jds_ptr_int[1]+ix;
-            i =  d_index[j];
-            int in;
-            float dn;
-            float tn;
-            for(int k=2; k<bound; k++ )
-            {
-                //prefetch k-1
-                dn = d_data[j];
-                //prefetch k
-                j=jds_ptr_int[k]+ix;
-                in = d_index[j];
-                //prefetch k-1
-                tn = x_vec[i];
-
-                //compute k-2
-                sum += d*t;
-                //sweep to k
-                i = in;
-                //sweep to k-1
-                d = dn;
-                t =tn;
-            }
-
-            //fetch last
-            dn = d_data[j];
-            tn = x_vec[i];
-
-            //compute last-1
-            sum += d*t;
-            //sweep to last
-            d=dn;
-            t=tn;
-        }
-        //compute last
-        sum += d*t;  // 3 3
-
-        //write out data
-        dst_vector[d_perm[ix]]=sum;
+      // prefetch 1
+      j = jds_ptr_int[1] + ix;
+      i = d_index[j];
+      int in;
+      float dn;
+      float tn;
+      for (int k = 2; k < bound; k++) {
+        // prefetch k-1
+        dn = d_data[j];
+        // prefetch k
+        j = jds_ptr_int[k] + ix;
+        in = d_index[j];
+        // prefetch k-1
+        tn = x_vec[i];
+
+        // compute k-2
+        sum += d * t;
+        // sweep to k
+        i = in;
+        // sweep to k-1
+        d = dn;
+        t = tn;
+      }
+
+      // fetch last
+      dn = d_data[j];
+      tn = x_vec[i];
+
+      // compute last-1
+      sum += d * t;
+      // sweep to last
+      d = dn;
+      t = tn;
     }
-}
+    // compute last
+    sum += d * t; // 3 3
 
-void spmvLvl1(float* dst_vector, size_t bytes_dst_vector,
-              float* d_data, size_t bytes_d_data,
-              int* d_index, size_t bytes_d_index,
-              int* d_perm, size_t bytes_d_perm,
-              float* x_vec, size_t bytes_x_vec,
-              int dim,
-              int* jds_ptr_int, size_t bytes_jds_ptr_int,
-              int* sh_zcnt_int, size_t bytes_sh_zcnt_int,
-              size_t dim_X1)
-{
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int,
-                       1, dst_vector);
-    void* spmv_node = __visc__createNodeND(1, spmv_jds, dim_X1);
-    __visc__bindIn(spmv_node, 0, 0, 0);
-    __visc__bindIn(spmv_node, 1, 1, 0);
-    __visc__bindIn(spmv_node, 2, 2, 0);
-    __visc__bindIn(spmv_node, 3, 3, 0);
-    __visc__bindIn(spmv_node, 4, 4, 0);
-    __visc__bindIn(spmv_node, 5, 5, 0);
-    __visc__bindIn(spmv_node, 6, 6, 0);
-    __visc__bindIn(spmv_node, 7, 7, 0);
-    __visc__bindIn(spmv_node, 8, 8, 0);
-    __visc__bindIn(spmv_node, 9, 9, 0);
-    __visc__bindIn(spmv_node, 10, 10, 0);
-    __visc__bindIn(spmv_node, 11, 11, 0);
-    __visc__bindIn(spmv_node, 12, 12, 0);
-    __visc__bindIn(spmv_node, 13, 13, 0);
-    __visc__bindIn(spmv_node, 14, 14, 0);
+    // write out data
+    dst_vector[d_perm[ix]] = sum;
+  }
 }
 
-void spmvLvl2(float* dst_vector, size_t bytes_dst_vector,
-              float* d_data, size_t bytes_d_data,
-              int* d_index, size_t bytes_d_index,
-              int* d_perm, size_t bytes_d_perm,
-              float* x_vec, size_t bytes_x_vec,
-              int dim,
-              int* jds_ptr_int, size_t bytes_jds_ptr_int,
-              int* sh_zcnt_int, size_t bytes_sh_zcnt_int,
-              size_t dim_X1, size_t dim_X2)
-{
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int,
-                       1, dst_vector);
-    void* spmv_node = __visc__createNodeND(1, spmvLvl1, dim_X2);
-    __visc__bindIn(spmv_node, 0, 0, 0);
-    __visc__bindIn(spmv_node, 1, 1, 0);
-    __visc__bindIn(spmv_node, 2, 2, 0);
-    __visc__bindIn(spmv_node, 3, 3, 0);
-    __visc__bindIn(spmv_node, 4, 4, 0);
-    __visc__bindIn(spmv_node, 5, 5, 0);
-    __visc__bindIn(spmv_node, 6, 6, 0);
-    __visc__bindIn(spmv_node, 7, 7, 0);
-    __visc__bindIn(spmv_node, 8, 8, 0);
-    __visc__bindIn(spmv_node, 9, 9, 0);
-    __visc__bindIn(spmv_node, 10, 10, 0);
-    __visc__bindIn(spmv_node, 11, 11, 0);
-    __visc__bindIn(spmv_node, 12, 12, 0);
-    __visc__bindIn(spmv_node, 13, 13, 0);
-    __visc__bindIn(spmv_node, 14, 14, 0);
-    __visc__bindIn(spmv_node, 15, 15, 0);
+void spmvLvl1(float *dst_vector, size_t bytes_dst_vector, float *d_data,
+              size_t bytes_d_data, int *d_index, size_t bytes_d_index,
+              int *d_perm, size_t bytes_d_perm, float *x_vec,
+              size_t bytes_x_vec, int dim, int *jds_ptr_int,
+              size_t bytes_jds_ptr_int, int *sh_zcnt_int,
+              size_t bytes_sh_zcnt_int, size_t dim_X1) {
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+                     sh_zcnt_int, 1, dst_vector);
+  void *spmv_node = __visc__createNodeND(1, spmv_jds, dim_X1);
+  __visc__bindIn(spmv_node, 0, 0, 0);
+  __visc__bindIn(spmv_node, 1, 1, 0);
+  __visc__bindIn(spmv_node, 2, 2, 0);
+  __visc__bindIn(spmv_node, 3, 3, 0);
+  __visc__bindIn(spmv_node, 4, 4, 0);
+  __visc__bindIn(spmv_node, 5, 5, 0);
+  __visc__bindIn(spmv_node, 6, 6, 0);
+  __visc__bindIn(spmv_node, 7, 7, 0);
+  __visc__bindIn(spmv_node, 8, 8, 0);
+  __visc__bindIn(spmv_node, 9, 9, 0);
+  __visc__bindIn(spmv_node, 10, 10, 0);
+  __visc__bindIn(spmv_node, 11, 11, 0);
+  __visc__bindIn(spmv_node, 12, 12, 0);
+  __visc__bindIn(spmv_node, 13, 13, 0);
+  __visc__bindIn(spmv_node, 14, 14, 0);
 }
 
-void spmvLvl3(float* dst_vector, size_t bytes_dst_vector,
-              float* d_data, size_t bytes_d_data,
-              int* d_index, size_t bytes_d_index,
-              int* d_perm, size_t bytes_d_perm,
-              float* x_vec, size_t bytes_x_vec,
-              int dim,
-              int* jds_ptr_int, size_t bytes_jds_ptr_int,
-              int* sh_zcnt_int, size_t bytes_sh_zcnt_int,
-              size_t dim_X1, size_t dim_X2)
-{
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int, sh_zcnt_int,
-                       1, dst_vector);
-    void* spmv_node = __visc__createNodeND(1, spmvLvl2, dim_X2);
-    __visc__bindIn(spmv_node, 0, 0, 0);
-    __visc__bindIn(spmv_node, 1, 1, 0);
-    __visc__bindIn(spmv_node, 2, 2, 0);
-    __visc__bindIn(spmv_node, 3, 3, 0);
-    __visc__bindIn(spmv_node, 4, 4, 0);
-    __visc__bindIn(spmv_node, 5, 5, 0);
-    __visc__bindIn(spmv_node, 6, 6, 0);
-    __visc__bindIn(spmv_node, 7, 7, 0);
-    __visc__bindIn(spmv_node, 8, 8, 0);
-    __visc__bindIn(spmv_node, 9, 9, 0);
-    __visc__bindIn(spmv_node, 10, 10, 0);
-    __visc__bindIn(spmv_node, 11, 11, 0);
-    __visc__bindIn(spmv_node, 12, 12, 0);
-    __visc__bindIn(spmv_node, 13, 13, 0);
-    __visc__bindIn(spmv_node, 14, 14, 0);
-    __visc__bindIn(spmv_node, 15, 15, 0);
-    __visc__bindIn(spmv_node, 16, 16, 0);
+void spmvLvl2(float *dst_vector, size_t bytes_dst_vector, float *d_data,
+              size_t bytes_d_data, int *d_index, size_t bytes_d_index,
+              int *d_perm, size_t bytes_d_perm, float *x_vec,
+              size_t bytes_x_vec, int dim, int *jds_ptr_int,
+              size_t bytes_jds_ptr_int, int *sh_zcnt_int,
+              size_t bytes_sh_zcnt_int, size_t dim_X1, size_t dim_X2) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+                     sh_zcnt_int, 1, dst_vector);
+  void *spmv_node = __visc__createNodeND(1, spmvLvl1, dim_X2);
+  __visc__bindIn(spmv_node, 0, 0, 0);
+  __visc__bindIn(spmv_node, 1, 1, 0);
+  __visc__bindIn(spmv_node, 2, 2, 0);
+  __visc__bindIn(spmv_node, 3, 3, 0);
+  __visc__bindIn(spmv_node, 4, 4, 0);
+  __visc__bindIn(spmv_node, 5, 5, 0);
+  __visc__bindIn(spmv_node, 6, 6, 0);
+  __visc__bindIn(spmv_node, 7, 7, 0);
+  __visc__bindIn(spmv_node, 8, 8, 0);
+  __visc__bindIn(spmv_node, 9, 9, 0);
+  __visc__bindIn(spmv_node, 10, 10, 0);
+  __visc__bindIn(spmv_node, 11, 11, 0);
+  __visc__bindIn(spmv_node, 12, 12, 0);
+  __visc__bindIn(spmv_node, 13, 13, 0);
+  __visc__bindIn(spmv_node, 14, 14, 0);
+  __visc__bindIn(spmv_node, 15, 15, 0);
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated sparse matrix vector multiplication****\n");
-    printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao Wu<wu14@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-    if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL))
-    {
-        fprintf(stderr, "Expecting one two filenames\n");
-        exit(-1);
-    }
-
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-
-    //parameters declaration
-    int len;
-    int depth;
-    int dim;
-    int pad=32;
-    int nzcnt_len;
-
-    //host memory allocation
-    //matrix
-    float *h_data;
-    int *h_indices;
-    int *h_ptr;
-    int *h_perm;
-    int *h_nzcnt;
-
-    //vector
-    float *h_Ax_vector;
-    float *h_x_vector;
-
-    //load matrix from files
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
-    //    &h_data, &h_indices, &h_ptr,
-    //    &h_perm, &h_nzcnt);
-    int col_count;
-
-    coo_to_jds(
-        parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
-        1, // row padding
-        pad, // warp size
-        1, // pack size
-        1, // is mirrored?
-        0, // binary matrix
-        1, // debug level [0:2]
-        &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm,
-        &col_count, &dim, &len, &nzcnt_len, &depth
-    );
-
-    h_Ax_vector=(float*)malloc(sizeof(float)*dim);
-    h_x_vector=(float*)malloc(sizeof(float)*dim);
-    input_vec( parameters->inpFiles[1],h_x_vector,dim);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    memset(h_Ax_vector, 0, dim*sizeof(float));
-
-    size_t grid;
-    size_t block;
-
-    compute_active_thread(&block, &grid, nzcnt_len, pad, 3, 0, 8);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-    llvm_visc_track_mem(h_Ax_vector, dim*sizeof(float));
-    llvm_visc_track_mem(h_data, len*sizeof(float));
-    llvm_visc_track_mem(h_indices, len*sizeof(int));
-    llvm_visc_track_mem(h_perm, dim*sizeof(int));
-    llvm_visc_track_mem(h_x_vector, dim*sizeof(float));
-    llvm_visc_track_mem(h_ptr, depth*sizeof(int));
-    llvm_visc_track_mem(h_nzcnt, nzcnt_len*sizeof(int));
+void spmvLvl3(float *dst_vector, size_t bytes_dst_vector, float *d_data,
+              size_t bytes_d_data, int *d_index, size_t bytes_d_index,
+              int *d_perm, size_t bytes_d_perm, float *x_vec,
+              size_t bytes_x_vec, int dim, int *jds_ptr_int,
+              size_t bytes_jds_ptr_int, int *sh_zcnt_int,
+              size_t bytes_sh_zcnt_int, size_t dim_X1, size_t dim_X2) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(7, dst_vector, d_data, d_index, d_perm, x_vec, jds_ptr_int,
+                     sh_zcnt_int, 1, dst_vector);
+  void *spmv_node = __visc__createNodeND(1, spmvLvl2, dim_X2);
+  __visc__bindIn(spmv_node, 0, 0, 0);
+  __visc__bindIn(spmv_node, 1, 1, 0);
+  __visc__bindIn(spmv_node, 2, 2, 0);
+  __visc__bindIn(spmv_node, 3, 3, 0);
+  __visc__bindIn(spmv_node, 4, 4, 0);
+  __visc__bindIn(spmv_node, 5, 5, 0);
+  __visc__bindIn(spmv_node, 6, 6, 0);
+  __visc__bindIn(spmv_node, 7, 7, 0);
+  __visc__bindIn(spmv_node, 8, 8, 0);
+  __visc__bindIn(spmv_node, 9, 9, 0);
+  __visc__bindIn(spmv_node, 10, 10, 0);
+  __visc__bindIn(spmv_node, 11, 11, 0);
+  __visc__bindIn(spmv_node, 12, 12, 0);
+  __visc__bindIn(spmv_node, 13, 13, 0);
+  __visc__bindIn(spmv_node, 14, 14, 0);
+  __visc__bindIn(spmv_node, 15, 15, 0);
+  __visc__bindIn(spmv_node, 16, 16, 0);
+}
 
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated sparse matrix vector multiplication****\n");
+  printf("Li-Wen Chang <lchang20@illinois.edu> and Shengzhao "
+         "Wu<wu14@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) {
+    fprintf(stderr, "Expecting one two filenames\n");
+    exit(-1);
+  }
+
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+
+  // parameters declaration
+  int len;
+  int depth;
+  int dim;
+  int pad = 32;
+  int nzcnt_len;
+
+  // host memory allocation
+  // matrix
+  float *h_data;
+  int *h_indices;
+  int *h_ptr;
+  int *h_perm;
+  int *h_nzcnt;
+
+  // vector
+  float *h_Ax_vector;
+  float *h_x_vector;
+
+  // load matrix from files
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  // inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad,
+  //    &h_data, &h_indices, &h_ptr,
+  //    &h_perm, &h_nzcnt);
+  int col_count;
+
+  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
+             1,                       // row padding
+             pad,                     // warp size
+             1,                       // pack size
+             1,                       // is mirrored?
+             0,                       // binary matrix
+             1,                       // debug level [0:2]
+             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
+             &len, &nzcnt_len, &depth);
+
+  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
+  h_x_vector = (float *)malloc(sizeof(float) * dim);
+  input_vec(parameters->inpFiles[1], h_x_vector, dim);
+
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  memset(h_Ax_vector, 0, dim * sizeof(float));
+
+  size_t grid;
+  size_t block;
+
+  compute_active_thread(&block, &grid, nzcnt_len, pad, 3, 0, 8);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
+  llvm_visc_track_mem(h_Ax_vector, dim * sizeof(float));
+  llvm_visc_track_mem(h_data, len * sizeof(float));
+  llvm_visc_track_mem(h_indices, len * sizeof(int));
+  llvm_visc_track_mem(h_perm, dim * sizeof(int));
+  llvm_visc_track_mem(h_x_vector, dim * sizeof(float));
+  llvm_visc_track_mem(h_ptr, depth * sizeof(int));
+  llvm_visc_track_mem(h_nzcnt, nzcnt_len * sizeof(int));
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  int i;
+  for (i = 0; i < 50; i++) {
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    //main execution
+    void *root_in = malloc(sizeof(RootIn));
+    RootIn root_in_local = {h_Ax_vector,
+                            dim * sizeof(float),
+                            h_data,
+                            len * sizeof(float),
+                            h_indices,
+                            len * sizeof(int),
+                            h_perm,
+                            dim * sizeof(int),
+                            h_x_vector,
+                            dim * sizeof(float),
+                            dim,
+                            h_ptr,
+                            depth * sizeof(int),
+                            h_nzcnt,
+                            nzcnt_len * sizeof(int),
+                            block,
+                            (grid / block)};
+    *(RootIn *)root_in = root_in_local;
+    void *spmvDFG = __visc__launch(0, spmvLvl3, root_in);
+
+    __visc__wait(spmvDFG);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    int i;
-    for(i=0; i<50; i++)
-    {
-        pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-        void* root_in = malloc(sizeof(RootIn));
-        RootIn root_in_local = {
-            h_Ax_vector, dim * sizeof(float),
-            h_data, len * sizeof(float),
-            h_indices, len * sizeof(int),
-            h_perm, dim * sizeof(int),
-            h_x_vector, dim * sizeof(float),
-            dim,
-            h_ptr, depth * sizeof(int),
-            h_nzcnt, nzcnt_len * sizeof(int),
-            block, (grid/block)
-        };
-        *(RootIn*)root_in = root_in_local;
-        void* spmvDFG = __visc__launch(0, spmvLvl3, root_in);
-       
-        __visc__wait(spmvDFG);
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-        /******************************* Issues *******************************
-         * 1. Using OpenCL to compute grid and block dimensions
-         *    (getting device info)
-         *    We need to check the GPU version (major number) where this kernel
-         *    executes to compare against opencl_nvidia version
-         * 2. Type of cl_mem buffer for d_x_vector is created with size of float,
-              but copied in through size of int.
-              Due to type of h_x_vector, I chose to use float
-         *    (Minor)
-         * 3. Kernel initially used constant memory for last two arguments - removed
-         */
-    }
-
-    //HtoD memory copy
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    llvm_visc_request_mem(h_Ax_vector, dim*sizeof(float));
-
-
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
-
-    llvm_visc_untrack_mem(h_Ax_vector);
-    llvm_visc_untrack_mem(h_data);
-    llvm_visc_untrack_mem(h_indices);
-    llvm_visc_untrack_mem(h_perm);
-    llvm_visc_untrack_mem(h_x_vector);
-    llvm_visc_untrack_mem(h_ptr);
-    llvm_visc_untrack_mem(h_nzcnt);
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+    /******************************* Issues *******************************
+     * 1. Using OpenCL to compute grid and block dimensions
+     *    (getting device info)
+     *    We need to check the GPU version (major number) where this kernel
+     *    executes to compare against opencl_nvidia version
+     * 2. Type of cl_mem buffer for d_x_vector is created with size of float,
+          but copied in through size of int.
+          Due to type of h_x_vector, I chose to use float
+     *    (Minor)
+     * 3. Kernel initially used constant memory for last two arguments - removed
+     */
+  }
+
+  // HtoD memory copy
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_visc_request_mem(h_Ax_vector, dim * sizeof(float));
+
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+
+  llvm_visc_untrack_mem(h_Ax_vector);
+  llvm_visc_untrack_mem(h_data);
+  llvm_visc_untrack_mem(h_indices);
+  llvm_visc_untrack_mem(h_perm);
+  llvm_visc_untrack_mem(h_x_vector);
+  llvm_visc_untrack_mem(h_ptr);
+  llvm_visc_untrack_mem(h_nzcnt);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  __visc__cleanup();
+
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Ax_vector, dim);
+  }
 
-    pb_PrintTimerSet(&timers);
-    __visc__cleanup();
+  free(h_data);
+  free(h_indices);
+  free(h_ptr);
+  free(h_perm);
+  free(h_nzcnt);
+  free(h_Ax_vector);
+  free(h_x_vector);
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Ax_vector,dim);
+  pb_FreeParameters(parameters);
 
-    }
- 
-    free (h_data);
-    free (h_indices);
-    free (h_ptr);
-    free (h_perm);
-    free (h_nzcnt);
-    free (h_Ax_vector);
-    free (h_x_vector);
-
-    pb_FreeParameters(parameters);
-
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/spmv/vectorgen/vectorgen.cc b/hpvm/test/parboil/benchmarks/spmv/vectorgen/vectorgen.cc
index bf6b6dc5b877951cf6da9b27cc0c2bf4007009a8..e94700924dba991140c6db55994f3e7805f90a29 100644
--- a/hpvm/test/parboil/benchmarks/spmv/vectorgen/vectorgen.cc
+++ b/hpvm/test/parboil/benchmarks/spmv/vectorgen/vectorgen.cc
@@ -1,43 +1,39 @@
 #include <endian.h>
-#include <stdlib.h>
-#include <stdio.h>
 #include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-static int generate_vector(float *x_vector, int dim)
-{
-    srand(54321);
-    int i;
-    for(i=0; i<dim; i++)
-    {
-        x_vector[i] = (rand() / (float) RAND_MAX);
-    }
-    return 0;
+static int generate_vector(float *x_vector, int dim) {
+  srand(54321);
+  int i;
+  for (i = 0; i < dim; i++) {
+    x_vector[i] = (rand() / (float)RAND_MAX);
+  }
+  return 0;
 }
 
-void outputData(char* fName, float *A0, int dim)
-{
-  FILE* fid = fopen(fName, "w");
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  
+void outputData(char *fName, float *A0, int dim) {
+  FILE *fid = fopen(fName, "w");
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+
   fwrite(A0, sizeof(float), dim, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-int main(int argc, char** argv) {
+int main(int argc, char **argv) {
 
   int dim;
   dim = atoi(argv[1]);
-  char * writefn = argv[2];
-  float *outV = (float*) malloc(dim * sizeof(float));
+  char *writefn = argv[2];
+  float *outV = (float *)malloc(dim * sizeof(float));
   generate_vector(outV, dim);
   outputData(writefn, outV, dim);
 
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/common.h b/hpvm/test/parboil/benchmarks/stencil/src/cpu/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.c b/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.c
index 91a4c946b88f9ea27c88598f86924abfd805899a..5350a506f995b716c2d2460369e5ead5336e4361 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.c
@@ -7,28 +7,25 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.h b/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.h
index 91240cd5e45d4ed14f5d0e6e4d27818a1e5cf7bc..5b09962e164174e16f9bd0294b2fb61a42b2762f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/file.h
@@ -6,6 +6,6 @@
  *cr
  ***************************************************************************/
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
+void inputData(char *fName, int *nx, int *ny, int *nz);
 
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.c b/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.c
index 00b3be9005b1cbb86aee7512d604a44a79191229..af00833058939f92ee5fb7e890240e47b025fb60 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.c
@@ -8,28 +8,22 @@
 
 #include "common.h"
 
-void cpu_stencil(float c0,float c1, float *A0,float * Anext,const int nx, const int ny, const int nz)
-{
+void cpu_stencil(float c0, float c1, float *A0, float *Anext, const int nx,
+                 const int ny, const int nz) {
 
   int i, j, k;
-	for(i=1;i<nx-1;i++)
-	{
-		for(j=1;j<ny-1;j++)
-		{
-			for(k=1;k<nz-1;k++)
-			{
-				Anext[Index3D (nx, ny, i, j, k)] = 
-				(A0[Index3D (nx, ny, i, j, k + 1)] +
-				A0[Index3D (nx, ny, i, j, k - 1)] +
-				A0[Index3D (nx, ny, i, j + 1, k)] +
-				A0[Index3D (nx, ny, i, j - 1, k)] +
-				A0[Index3D (nx, ny, i + 1, j, k)] +
-				A0[Index3D (nx, ny, i - 1, j, k)])*c1
-				- A0[Index3D (nx, ny, i, j, k)]*c0;
-			}
-		}
-	}
-
+  for (i = 1; i < nx - 1; i++) {
+    for (j = 1; j < ny - 1; j++) {
+      for (k = 1; k < nz - 1; k++) {
+        Anext[Index3D(nx, ny, i, j, k)] = (A0[Index3D(nx, ny, i, j, k + 1)] +
+                                           A0[Index3D(nx, ny, i, j, k - 1)] +
+                                           A0[Index3D(nx, ny, i, j + 1, k)] +
+                                           A0[Index3D(nx, ny, i, j - 1, k)] +
+                                           A0[Index3D(nx, ny, i + 1, j, k)] +
+                                           A0[Index3D(nx, ny, i - 1, j, k)]) *
+                                              c1 -
+                                          A0[Index3D(nx, ny, i, j, k)] * c0;
+      }
+    }
+  }
 }
-
-
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.h b/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.h
index b6735126ac8bf905d9f89b846a580e247cef4cfa..68fb021719c17d6bab318472d9be6c83665d3ab4 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/kernels.h
@@ -6,6 +6,5 @@
  *cr
  ***************************************************************************/
 
-
-
-void cpu_stencil(float c0,float c1, float *A0,float * Anext,const int nx, const int ny, const int nz);
+void cpu_stencil(float c0, float c1, float *A0, float *Anext, const int nx,
+                 const int ny, const int nz);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cpu/main.c b/hpvm/test/parboil/benchmarks/stencil/src/cpu/main.c
index 9aa7fa57ab7f73694320aacb4f80d2dc6ed3f333..583b65251a6dd4050c9dcd14166869fc6c219fa6 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cpu/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cpu/main.c
@@ -11,119 +11,103 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 #include "kernels.h"
 
-
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) 
-{	
-	int s=0;
-        int i, j, k;
-	for(i=0;i<nz;i++)
-	{
-		for(j=0;j<ny;j++)
-		{
-			for(k=0;k<nx;k++)
-			{
-                                fread(A0+s,sizeof(float),1,fp);
-				s++;
-			}
-		}
-	}
-	return 0;
-}
-
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-
-	
-	printf("CPU-based 7 points stencil codes****\n");
-	printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and I-Jui Sung<sung10@illinois.edu>\n");
-	printf("This version maintained by Chris Rodrigues  ***********\n");
-	parameters = pb_ReadParameters(&argc, argv);
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	//declaration
-	int nx,ny,nz;
-	int size;
-    int iteration;
-	float c0=1.0f/6.0f;
-	float c1=1.0f/6.0f/6.0f;
-
-	if (argc<5) 
-    {
-      printf("Usage: probe nx ny nz tx ty t\n"
-	     "nx: the grid size x\n"
-	     "ny: the grid size y\n"
-	     "nz: the grid size z\n"
-		  "t: the iteration time\n");
-      return -1;
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
+  }
+  return 0;
+}
 
-	nx = atoi(argv[1]);
-	if (nx<1)
-		return -1;
-	ny = atoi(argv[2]);
-	if (ny<1)
-		return -1;
-	nz = atoi(argv[3]);
-	if (nz<1)
-		return -1;
-	iteration = atoi(argv[4]);
-	if(iteration<1)
-		return -1;
-
-	
-	//host data
-	float *h_A0;
-	float *h_Anext;
-
-	size=nx*ny*nz;
-	
-	h_A0=(float*)malloc(sizeof(float)*size);
-	h_Anext=(float*)malloc(sizeof(float)*size);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("CPU-based 7 points stencil codes****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and I-Jui "
+         "Sung<sung10@illinois.edu>\n");
+  printf("This version maintained by Chris Rodrigues  ***********\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // declaration
+  int nx, ny, nz;
+  int size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz tx ty t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
+
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
   FILE *fp = fopen(parameters->inpFiles[0], "rb");
-	read_data(h_A0, nx,ny,nz,fp);
+  read_data(h_A0, nx, ny, nz, fp);
   fclose(fp);
-  memcpy (h_Anext,h_A0 ,sizeof(float)*size);
-
-
-  
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
 
   int t;
-	for(t=0;t<iteration;t++)
-	{
-		cpu_stencil(c0,c1, h_A0, h_Anext, nx, ny,  nz);
-    float *temp=h_A0;
+  for (t = 0; t < iteration; t++) {
+    cpu_stencil(c0, c1, h_A0, h_Anext, nx, ny, nz);
+    float *temp = h_A0;
     h_A0 = h_Anext;
     h_Anext = temp;
+  }
 
-	}
-
-  float *temp=h_A0;
+  float *temp = h_A0;
   h_A0 = h_Anext;
   h_Anext = temp;
 
- 
-	if (parameters->outFile) {
-		 pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Anext,nx,ny,nz);
-		
-	}
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-		
-	free (h_A0);
-	free (h_Anext);
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
+  free(h_A0);
+  free(h_Anext);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-	return 0;
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
 
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda/common.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda/cuerr.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda/cuerr.h
index d04cada2ace5b85770a16f26f2f639ebd9eb5248..465de41bc02d805fe9a3819bd305c94bb7c9f337 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda/cuerr.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda/cuerr.h
@@ -6,10 +6,11 @@
  *cr
  ***************************************************************************/
 
-
-
-
-#define CUERR { cudaError_t err; \
-  if ((err = cudaGetLastError()) != cudaSuccess) { \
-  printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-  return -1; }}
+#define CUERR                                                                  \
+  {                                                                            \
+    cudaError_t err;                                                           \
+    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
+      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__);  \
+      return -1;                                                               \
+    }                                                                          \
+  }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.cc
index 91a4c946b88f9ea27c88598f86924abfd805899a..5350a506f995b716c2d2460369e5ead5336e4361 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.cc
@@ -7,28 +7,25 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.h
index 903d42610b27bfe57703e4c032c5f508d9eb9cb3..5c7731cb348e5b3bbbe1aa1214d4f0061651e178 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda/file.h
@@ -6,4 +6,4 @@
  *cr
  ***************************************************************************/
 
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda/kernels.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda/kernels.h
index d548f9da3219535d6ca72b0d3a702401e61416b9..57253d774a03e40a41b1b9d587e518fe13af265e 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda/kernels.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda/kernels.h
@@ -6,13 +6,14 @@
  *cr
  ***************************************************************************/
 
+#define CUERR                                                                  \
+  {                                                                            \
+    cudaError_t err;                                                           \
+    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
+      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__);  \
+      return -1;                                                               \
+    }                                                                          \
+  }
 
-
-
-#define CUERR { cudaError_t err; \
-  if ((err = cudaGetLastError()) != cudaSuccess) { \
-  printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-  return -1; }}
-
-
-__global__ void block2D_hybrid_coarsen_x(float fac,float *A0,float *Anext, int nx, int ny, int nz);
+__global__ void block2D_hybrid_coarsen_x(float fac, float *A0, float *Anext,
+                                         int nx, int ny, int nz);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/common.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/cuerr.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/cuerr.h
index d04cada2ace5b85770a16f26f2f639ebd9eb5248..465de41bc02d805fe9a3819bd305c94bb7c9f337 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/cuerr.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/cuerr.h
@@ -6,10 +6,11 @@
  *cr
  ***************************************************************************/
 
-
-
-
-#define CUERR { cudaError_t err; \
-  if ((err = cudaGetLastError()) != cudaSuccess) { \
-  printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-  return -1; }}
+#define CUERR                                                                  \
+  {                                                                            \
+    cudaError_t err;                                                           \
+    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
+      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__);  \
+      return -1;                                                               \
+    }                                                                          \
+  }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.cc
index 91a4c946b88f9ea27c88598f86924abfd805899a..5350a506f995b716c2d2460369e5ead5336e4361 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.cc
@@ -7,28 +7,25 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.h
index 903d42610b27bfe57703e4c032c5f508d9eb9cb3..5c7731cb348e5b3bbbe1aa1214d4f0061651e178 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_base/file.h
@@ -6,4 +6,4 @@
  *cr
  ***************************************************************************/
 
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/common.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/cuerr.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/cuerr.h
index d04cada2ace5b85770a16f26f2f639ebd9eb5248..465de41bc02d805fe9a3819bd305c94bb7c9f337 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/cuerr.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/cuerr.h
@@ -6,10 +6,11 @@
  *cr
  ***************************************************************************/
 
-
-
-
-#define CUERR { cudaError_t err; \
-  if ((err = cudaGetLastError()) != cudaSuccess) { \
-  printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \
-  return -1; }}
+#define CUERR                                                                  \
+  {                                                                            \
+    cudaError_t err;                                                           \
+    if ((err = cudaGetLastError()) != cudaSuccess) {                           \
+      printf("CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__);  \
+      return -1;                                                               \
+    }                                                                          \
+  }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.cc
index 91a4c946b88f9ea27c88598f86924abfd805899a..5350a506f995b716c2d2460369e5ead5336e4361 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.cc
@@ -7,28 +7,25 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.h b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.h
index 903d42610b27bfe57703e4c032c5f508d9eb9cb3..5c7731cb348e5b3bbbe1aa1214d4f0061651e178 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/cuda_fermi/file.h
@@ -6,4 +6,4 @@
  *cr
  ***************************************************************************/
 
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/common.h b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.c b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.c
index 91a4c946b88f9ea27c88598f86924abfd805899a..5350a506f995b716c2d2460369e5ead5336e4361 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.c
@@ -7,28 +7,25 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.h b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.h
index 91240cd5e45d4ed14f5d0e6e4d27818a1e5cf7bc..5b09962e164174e16f9bd0294b2fb61a42b2762f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/file.h
@@ -6,6 +6,6 @@
  *cr
  ***************************************************************************/
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
+void inputData(char *fName, int *nx, int *ny, int *nz);
 
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.c b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.c
index 6619c8e1a8ee7e942e0937e7b2494fa5fc8fbf73..15ad898fb1d98bc7b02718ce268874c4c3a3c683 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.c
@@ -8,31 +8,25 @@
 
 #include "common.h"
 
-void cpu_stencil(float c0,float c1, float *A0,float * Anext,const int nx, const int ny, const int nz)
-{
-
-  int i;  
-  #pragma omp parallel for
-	for(i=1;i<nx-1;i++)
-	{
-    int j,k;
-		for(j=1;j<ny-1;j++)
-		{
-			for(k=1;k<nz-1;k++)
-			{
-  //i      #pragma omp critical
-				Anext[Index3D (nx, ny, i, j, k)] = 
-				(A0[Index3D (nx, ny, i, j, k + 1)] +
-				A0[Index3D (nx, ny, i, j, k - 1)] +
-				A0[Index3D (nx, ny, i, j + 1, k)] +
-				A0[Index3D (nx, ny, i, j - 1, k)] +
-				A0[Index3D (nx, ny, i + 1, j, k)] +
-				A0[Index3D (nx, ny, i - 1, j, k)])*c1
-				- A0[Index3D (nx, ny, i, j, k)]*c0;
-			}
-		}
-	}
+void cpu_stencil(float c0, float c1, float *A0, float *Anext, const int nx,
+                 const int ny, const int nz) {
 
+  int i;
+#pragma omp parallel for
+  for (i = 1; i < nx - 1; i++) {
+    int j, k;
+    for (j = 1; j < ny - 1; j++) {
+      for (k = 1; k < nz - 1; k++) {
+        // i      #pragma omp critical
+        Anext[Index3D(nx, ny, i, j, k)] = (A0[Index3D(nx, ny, i, j, k + 1)] +
+                                           A0[Index3D(nx, ny, i, j, k - 1)] +
+                                           A0[Index3D(nx, ny, i, j + 1, k)] +
+                                           A0[Index3D(nx, ny, i, j - 1, k)] +
+                                           A0[Index3D(nx, ny, i + 1, j, k)] +
+                                           A0[Index3D(nx, ny, i - 1, j, k)]) *
+                                              c1 -
+                                          A0[Index3D(nx, ny, i, j, k)] * c0;
+      }
+    }
+  }
 }
-
-
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.h b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.h
index b6735126ac8bf905d9f89b846a580e247cef4cfa..68fb021719c17d6bab318472d9be6c83665d3ab4 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/kernels.h
@@ -6,6 +6,5 @@
  *cr
  ***************************************************************************/
 
-
-
-void cpu_stencil(float c0,float c1, float *A0,float * Anext,const int nx, const int ny, const int nz);
+void cpu_stencil(float c0, float c1, float *A0, float *Anext, const int nx,
+                 const int ny, const int nz);
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/main.c b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/main.c
index c363d8ce732fa4d1c343f1874e5e9a283c32344a..583b65251a6dd4050c9dcd14166869fc6c219fa6 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/omp_base/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/omp_base/main.c
@@ -11,116 +11,103 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 #include "kernels.h"
 
-
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) 
-{	
-	int s=0;
-        int i, j, k;
-	for(i=0;i<nz;i++)
-	{
-		for(j=0;j<ny;j++)
-		{
-			for(k=0;k<nx;k++)
-			{
-                                fread(A0+s,sizeof(float),1,fp);
-				s++;
-			}
-		}
-	}
-	return 0;
-}
-
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-
-	
-	printf("CPU-based 7 points stencil codes****\n");
-	printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and I-Jui Sung<sung10@illinois.edu>\n");
-	printf("This version maintained by Chris Rodrigues  ***********\n");
-	parameters = pb_ReadParameters(&argc, argv);
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	//declaration
-	int nx,ny,nz;
-	int size;
-    int iteration;
-	float c0=1.0f/6.0f;
-	float c1=1.0f/6.0f/6.0f;
-
-	if (argc<5) 
-    {
-      printf("Usage: probe nx ny nz tx ty t\n"
-	     "nx: the grid size x\n"
-	     "ny: the grid size y\n"
-	     "nz: the grid size z\n"
-		  "t: the iteration time\n");
-      return -1;
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
+  }
+  return 0;
+}
 
-	nx = atoi(argv[1]);
-	if (nx<1)
-		return -1;
-	ny = atoi(argv[2]);
-	if (ny<1)
-		return -1;
-	nz = atoi(argv[3]);
-	if (nz<1)
-		return -1;
-	iteration = atoi(argv[4]);
-	if(iteration<1)
-		return -1;
-
-	
-	//host data
-	float *h_A0;
-	float *h_Anext;
-
-	size=nx*ny*nz;
-	
-	h_A0=(float*)malloc(sizeof(float)*size);
-	h_Anext=(float*)malloc(sizeof(float)*size);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("CPU-based 7 points stencil codes****\n");
+  printf("Original version by Li-Wen Chang <lchang20@illinois.edu> and I-Jui "
+         "Sung<sung10@illinois.edu>\n");
+  printf("This version maintained by Chris Rodrigues  ***********\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // declaration
+  int nx, ny, nz;
+  int size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz tx ty t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
+
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
   FILE *fp = fopen(parameters->inpFiles[0], "rb");
-	read_data(h_A0, nx,ny,nz,fp);
+  read_data(h_A0, nx, ny, nz, fp);
   fclose(fp);
-  memcpy (h_Anext,h_A0 ,sizeof(float)*size);
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
 
   int t;
-	for(t=0;t<iteration;t++)
-	{
-		cpu_stencil(c0,c1, h_A0, h_Anext, nx, ny,  nz);
-    float *temp=h_A0;
+  for (t = 0; t < iteration; t++) {
+    cpu_stencil(c0, c1, h_A0, h_Anext, nx, ny, nz);
+    float *temp = h_A0;
     h_A0 = h_Anext;
     h_Anext = temp;
+  }
 
-	}
-
-  float *temp=h_A0;
+  float *temp = h_A0;
   h_A0 = h_Anext;
   h_Anext = temp;
 
- 
-	if (parameters->outFile) {
-		 pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Anext,nx,ny,nz);
-		
-	}
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-		
-	free (h_A0);
-	free (h_Anext);
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
+  free(h_A0);
+  free(h_Anext);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-	return 0;
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
 
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h
index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/file.h
@@ -12,9 +12,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c
index 2fd7bb7b68cc3a61ad7e75c567b9998e6b151d96..ec47c22227648df094cbf03ea1b667943207207e 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base/main.c
@@ -8,226 +8,224 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
-{
-    int s=0;
-    int i,j,k;
-    for(i=0; i<nz; i++)
-    {
-        for(j=0; j<ny; j++)
-        {
-            for(k=0; k<nx; k++)
-            {
-                fread(A0+s,sizeof(float),1,fp);
-                s++;
-            }
-        }
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated 7 points stencil codes****\n");
-    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-
-    //declaration
-    unsigned nx,ny,nz;
-    unsigned size;
-    int iteration;
-    float c0=1.0f/6.0f;
-    float c1=1.0f/6.0f/6.0f;
-
-    if (argc<5)
-    {
-        printf("Usage: probe nx ny nz t\n"
-               "nx: the grid size x\n"
-               "ny: the grid size y\n"
-               "nz: the grid size z\n"
-               "t: the iteration time\n");
-        return -1;
-    }
-
-    nx = atoi(argv[1]);
-    if (nx<1)
-        return -1;
-    ny = atoi(argv[2]);
-    if (ny<1)
-        return -1;
-    nz = atoi(argv[3]);
-    if (nz<1)
-        return -1;
-    iteration = atoi(argv[4]);
-    if(iteration<1)
-        return -1;
-
-    //host data
-    float *h_A0;
-    float *h_Anext;
-    //load data from files
-
-    size=nx*ny*nz;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0, nx,ny,nz,fp);
-    fclose(fp);
- 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    cl_int clStatus;
-    cl_platform_id clPlatform;
-    clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-    CHECK_ERROR("clGetPlaformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    /*cl_program clProgram;*/
-    /*cl_kernel clKernel;*/
-
-    /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/
-    const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-    cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-    CHECK_ERROR("clCreateProgramWithSource")
-
-    char clOptions[50];
-    sprintf(clOptions,"-I src/opencl_base");
-    clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-    CHECK_ERROR("clBuildProgram")
-
-    cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
-    CHECK_ERROR("clCreateKernel")
-
-
-    //device
-    cl_mem d_A0;
-    cl_mem d_Anext;
-
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-
-
-    //memory allocation
-    d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //only use 1D thread block
-    unsigned tx =256;
-    size_t block[3] = {tx,1,1};
-    size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
-    //size_t grid[3] = {nx-2,ny-2,nz-2};
-    size_t offset[3] = {1,1,1};
-//  printf("block x is %d and y is %d z \n",block[0],block[1]);
-//  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
-
-    clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-    clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-    clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-    clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    int t;
-    for(t=0; t<iteration; t++)
-    {
-        clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-        //printf("iteration %d\n",t)
-        CHECK_ERROR("clEnqueueNDRangeKernel")
-
-        cl_mem d_temp = d_A0;
-        d_A0 = d_Anext;
-        d_Anext = d_temp;
-
-        clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-        clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-
-    }
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  // declaration
+  unsigned nx, ny, nz;
+  unsigned size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
 
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+  // load data from files
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  read_data(h_A0, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  /*cl_program clProgram;*/
+  /*cl_kernel clKernel;*/
+
+  /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s",
+   * "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/
+  const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "-I src/opencl_base");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "naive_kernel", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use 1D thread block
+  unsigned tx = 256;
+  size_t block[3] = {tx, 1, 1};
+  size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2};
+  // size_t grid[3] = {nx-2,ny-2,nz-2};
+  size_t offset[3] = {1, 1, 1};
+  //  printf("block x is %d and y is %d z \n",block[0],block[1]);
+  //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  int t;
+  for (t = 0; t < iteration; t++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                      block, 0, NULL, NULL);
+    // printf("iteration %d\n",t)
+    CHECK_ERROR("clEnqueueNDRangeKernel")
 
     cl_mem d_temp = d_A0;
     d_A0 = d_Anext;
     d_Anext = d_temp;
 
-    clStatus = clFinish(clCommandQueue);
-    CHECK_ERROR("clFinish")
+    clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+    clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  }
+
+  cl_mem d_temp = d_A0;
+  d_A0 = d_Anext;
+  d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    clStatus = clReleaseMemObject(d_A0);
-    clStatus = clReleaseMemObject(d_Anext);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-    CHECK_ERROR("clReleaseContext")
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
 
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+  pb_PrintTimerSet(&timers);
 
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
-    free(h_A0);
-    free(h_Anext);
-    pb_FreeParameters(parameters);
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.h
index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/file.h
@@ -12,9 +12,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c
index c2cdd2d6264165b057b9620b4827eb23dff92b46..61382182d1c8b406a2e2ba9dee250327914dbac4 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_default/main.c
@@ -8,239 +8,241 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
-{
-    int s=0;
-    int i,j,k;
-    for(i=0; i<nz; i++)
-    {
-        for(j=0; j<ny; j++)
-        {
-            for(k=0; k<nx; k++)
-            {
-                fread(A0+s,sizeof(float),1,fp);
-                s++;
-            }
-        }
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated 7 points stencil codes****\n");
-    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-
-    //declaration
-    unsigned nx,ny,nz;
-    unsigned size;
-    int iteration;
-    float c0=1.0f/6.0f;
-    float c1=1.0f/6.0f/6.0f;
-
-    if (argc<5)
-    {
-        printf("Usage: probe nx ny nz t\n"
-               "nx: the grid size x\n"
-               "ny: the grid size y\n"
-               "nz: the grid size z\n"
-               "t: the iteration time\n");
-        return -1;
-    }
-
-    nx = atoi(argv[1]);
-    if (nx<1)
-        return -1;
-    ny = atoi(argv[2]);
-    if (ny<1)
-        return -1;
-    nz = atoi(argv[3]);
-    if (nz<1)
-        return -1;
-    iteration = atoi(argv[4]);
-    if(iteration<1)
-        return -1;
-
-    //host data
-    float *h_A0;
-    float *h_Anext;
-    //load data from files
-
-    size=nx*ny*nz;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0, nx,ny,nz,fp);
-    fclose(fp);
- 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    cl_int clStatus;
-    cl_platform_id clPlatform;
-    clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-    CHECK_ERROR("clGetPlaformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    cl_program clProgram;
-    cl_kernel clKernel;
-
-    pb_CreateAndBuildKernelFromBinary("build/opencl_base_default_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);
-    //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-    //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-    //CHECK_ERROR("clCreateProgramWithSource")
-
-    //char clOptions[50];
-    //sprintf(clOptions,"-I src/opencl_base");
-    //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-    //CHECK_ERROR("clBuildProgram")
-
-    //cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
-    //CHECK_ERROR("clCreateKernel")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //device
-    cl_mem d_A0;
-    cl_mem d_Anext;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    //memory allocation
-    d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //only use 1D thread block
-    unsigned tx =256;
-    size_t block[3] = {tx,1,1};
-    size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
-    //size_t grid[3] = {nx-2,ny-2,nz-2};
-    size_t offset[3] = {1,1,1};
-//  printf("block x is %d and y is %d z \n",block[0],block[1]);
-//  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-    clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-    clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-    clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    for(int i =0;i <10; i++) {
-      int t;
-      for(t=0; t<iteration; t++)
-      {
-          /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-          clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-          //printf("iteration %d\n",t)
-          CHECK_ERROR("clEnqueueNDRangeKernel")
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-
-          cl_mem d_temp = d_A0;
-          d_A0 = d_Anext;
-          d_Anext = d_temp;
-
-          clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-          clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  // declaration
+  unsigned nx, ny, nz;
+  unsigned size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
 
-      }
-      clStatus = clFinish(clCommandQueue);
-      CHECK_ERROR("clFinish")
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+  // load data from files
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  read_data(h_A0, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  cl_program clProgram;
+  cl_kernel clKernel;
+
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_base_default_default/kernel_offline.nvptx.s",
+      "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);
+  // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
+
+  // char clOptions[50];
+  // sprintf(clOptions,"-I src/opencl_base");
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
+
+  // cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use 1D thread block
+  unsigned tx = 256;
+  size_t block[3] = {tx, 1, 1};
+  size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2};
+  // size_t grid[3] = {nx-2,ny-2,nz-2};
+  size_t offset[3] = {1, 1, 1};
+  //  printf("block x is %d and y is %d z \n",block[0],block[1]);
+  //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  for (int i = 0; i < 10; i++) {
+    int t;
+    for (t = 0; t < iteration; t++) {
+      /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+      clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                        block, 0, NULL, NULL);
+      // printf("iteration %d\n",t)
+      CHECK_ERROR("clEnqueueNDRangeKernel")
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+
+      cl_mem d_temp = d_A0;
+      d_A0 = d_Anext;
+      d_Anext = d_temp;
+
+      clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+      clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
     }
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    cl_mem d_temp = d_A0;
-    d_A0 = d_Anext;
-    d_Anext = d_temp;
-
     clStatus = clFinish(clCommandQueue);
     CHECK_ERROR("clFinish")
+  }
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseMemObject(d_A0);
-    clStatus = clReleaseMemObject(d_Anext);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-    CHECK_ERROR("clReleaseContext")
+  cl_mem d_temp = d_A0;
+  d_A0 = d_Anext;
+  d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
 
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
 
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
-    free(h_A0);
-    free(h_Anext);
-    pb_FreeParameters(parameters);
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.h
index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/file.h
@@ -12,9 +12,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c
index 30b391963ce60b92cae0db3f2670ea49898f5196..217352e036b0d03bcc578286fd62c4339dedfe94 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_large/main.c
@@ -8,239 +8,241 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
-{
-    int s=0;
-    int i,j,k;
-    for(i=0; i<nz; i++)
-    {
-        for(j=0; j<ny; j++)
-        {
-            for(k=0; k<nx; k++)
-            {
-                fread(A0+s,sizeof(float),1,fp);
-                s++;
-            }
-        }
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated 7 points stencil codes****\n");
-    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-
-    //declaration
-    unsigned nx,ny,nz;
-    unsigned size;
-    int iteration;
-    float c0=1.0f/6.0f;
-    float c1=1.0f/6.0f/6.0f;
-
-    if (argc<5)
-    {
-        printf("Usage: probe nx ny nz t\n"
-               "nx: the grid size x\n"
-               "ny: the grid size y\n"
-               "nz: the grid size z\n"
-               "t: the iteration time\n");
-        return -1;
-    }
-
-    nx = atoi(argv[1]);
-    if (nx<1)
-        return -1;
-    ny = atoi(argv[2]);
-    if (ny<1)
-        return -1;
-    nz = atoi(argv[3]);
-    if (nz<1)
-        return -1;
-    iteration = atoi(argv[4]);
-    if(iteration<1)
-        return -1;
-
-    //host data
-    float *h_A0;
-    float *h_Anext;
-    //load data from files
-
-    size=nx*ny*nz;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0, nx,ny,nz,fp);
-    fclose(fp);
- 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    cl_int clStatus;
-    cl_platform_id clPlatform;
-    clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-    CHECK_ERROR("clGetPlaformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    cl_program clProgram;
-    cl_kernel clKernel;
-
-    pb_CreateAndBuildKernelFromBinary("build/opencl_base_large_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);
-    //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-    //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-    //CHECK_ERROR("clCreateProgramWithSource")
-
-    //char clOptions[50];
-    //sprintf(clOptions,"-I src/opencl_base");
-    //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-    //CHECK_ERROR("clBuildProgram")
-
-    //cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
-    //CHECK_ERROR("clCreateKernel")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //device
-    cl_mem d_A0;
-    cl_mem d_Anext;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    //memory allocation
-    d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //only use 1D thread block
-    unsigned tx =256;
-    size_t block[3] = {tx,1,1};
-    size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
-    //size_t grid[3] = {nx-2,ny-2,nz-2};
-    size_t offset[3] = {1,1,1};
-//  printf("block x is %d and y is %d z \n",block[0],block[1]);
-//  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-    clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-    clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-    clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    for(int i =0;i <1; i++) {
-      int t;
-      for(t=0; t<iteration; t++)
-      {
-          /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-          clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-          //printf("iteration %d\n",t)
-          CHECK_ERROR("clEnqueueNDRangeKernel")
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-
-          cl_mem d_temp = d_A0;
-          d_A0 = d_Anext;
-          d_Anext = d_temp;
-
-          clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-          clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  // declaration
+  unsigned nx, ny, nz;
+  unsigned size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
 
-      }
-      clStatus = clFinish(clCommandQueue);
-      CHECK_ERROR("clFinish")
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+  // load data from files
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  read_data(h_A0, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  cl_program clProgram;
+  cl_kernel clKernel;
+
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_base_large_default/kernel_offline.nvptx.s", "naive_kernel",
+      &clContext, &clDevice, &clProgram, &clKernel);
+  // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
+
+  // char clOptions[50];
+  // sprintf(clOptions,"-I src/opencl_base");
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
+
+  // cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use 1D thread block
+  unsigned tx = 256;
+  size_t block[3] = {tx, 1, 1};
+  size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2};
+  // size_t grid[3] = {nx-2,ny-2,nz-2};
+  size_t offset[3] = {1, 1, 1};
+  //  printf("block x is %d and y is %d z \n",block[0],block[1]);
+  //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  for (int i = 0; i < 1; i++) {
+    int t;
+    for (t = 0; t < iteration; t++) {
+      /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+      clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                        block, 0, NULL, NULL);
+      // printf("iteration %d\n",t)
+      CHECK_ERROR("clEnqueueNDRangeKernel")
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+
+      cl_mem d_temp = d_A0;
+      d_A0 = d_Anext;
+      d_Anext = d_temp;
+
+      clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+      clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
     }
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    cl_mem d_temp = d_A0;
-    d_A0 = d_Anext;
-    d_Anext = d_temp;
-
     clStatus = clFinish(clCommandQueue);
     CHECK_ERROR("clFinish")
+  }
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseMemObject(d_A0);
-    clStatus = clReleaseMemObject(d_Anext);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-    CHECK_ERROR("clReleaseContext")
+  cl_mem d_temp = d_A0;
+  d_A0 = d_Anext;
+  d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
 
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
 
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
-    free(h_A0);
-    free(h_Anext);
-    pb_FreeParameters(parameters);
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.h
index 85c998198e9ad26c4ac912439c533ec9ca4d7ada..0d2e87b0f14004d71ecedc86e822b0fdde8d6252 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/file.h
@@ -13,9 +13,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/main.c
index d62bf7fc6f2f21753fe2587d2c87e1ec2af6c47e..8456c3c5b8d0133b98df6150362068f704da5e1a 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_opt/main.c
@@ -8,224 +8,227 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) 
-{	
-	int s=0;
-	int i,j,k;
-	for(i=0;i<nz;i++)
-	{
-		for(j=0;j<ny;j++)
-		{
-			for(k=0;k<nx;k++)
-			{
-                                fread(A0+s,sizeof(float),1,fp);
-				s++;
-			}
-		}
-	}
-	return 0;
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
+    }
+  }
+  return 0;
 }
 
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // declaration
+  int nx, ny, nz;
+  int size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	printf("OpenCL accelerated 7 points stencil codes****\n");
-	printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-	parameters = pb_ReadParameters(&argc, argv);
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	//declaration
-	int nx,ny,nz;
-	int size;
-	int iteration;
-	float c0=1.0f/6.0f;
-	float c1=1.0f/6.0f/6.0f;
-
-	if (argc<5) 
-    	{
-	     printf("Usage: probe nx ny nz t\n"
-	     "nx: the grid size x\n"
-	     "ny: the grid size y\n"
-	     "nz: the grid size z\n"
-	     "t: the iteration time\n");
-	     return -1;
-	}
-
-	nx = atoi(argv[1]);
-	if (nx<1)
-		return -1;
-	ny = atoi(argv[2]);
-	if (ny<1)
-		return -1;
-	nz = atoi(argv[3]);
-	if (nz<1)
-		return -1;
-	iteration = atoi(argv[4]);
-	if(iteration<1)
-		return -1;
-	
-	cl_int clStatus;
-	cl_platform_id clPlatform;
-	clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-	CHECK_ERROR("clGetPlaformIDs")
-
-	cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-	
-	cl_device_id clDevice;
-	clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-	CHECK_ERROR("clGetDeviceIDs")
-
-	cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
-
-	cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
-
-  	pb_SetOpenCL(&clContext, &clCommandQueue);
-
-	/*cl_program clProgram;
*/
-	/*cl_kernel clKernel;
*/
-
-	/*pb_CreateAndBuildKernelFromBinary("build/opencl_base_opt_default/kernel.nvptx.s", "block2D_hybrid_coarsen_x", &clContext, &clDevice, &clProgram, &clKernel);
*/
-
-        const char* clSource[] = {readFile("src/opencl_base_opt/kernel.cl")};
-        cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-        CHECK_ERROR("clCreateProgramWithSource")
-
-        char clOptions[50];
-        sprintf(clOptions,"-I src/opencl_base_opt");
-        clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-        CHECK_ERROR("clBuildProgram")
-
-        cl_kernel clKernel = clCreateKernel(clProgram,"block2D_hybrid_coarsen_x",&clStatus);
-        CHECK_ERROR("clCreateKernel") 			
-
-	//host data
-	float *h_A0;
-	float *h_Anext;
-	
-	//device
-	cl_mem d_A0;
-	cl_mem d_Anext;
-
-	//load data from files
-	size=nx*ny*nz;
-	
-	h_A0=(float*)malloc(sizeof(float)*size);
-	h_Anext=(float*)malloc(sizeof(float)*size);
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  /*cl_program clProgram;
*/
+  /*cl_kernel clKernel;
*/
+
+  /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_opt_default/kernel.nvptx.s",
+   * "block2D_hybrid_coarsen_x", &clContext, &clDevice, &clProgram, &clKernel);
+   */
+
+  const char *clSource[] = {readFile("src/opencl_base_opt/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "-I src/opencl_base_opt");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel =
+      clCreateKernel(clProgram, "block2D_hybrid_coarsen_x", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  // load data from files
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
   FILE *fp = fopen(parameters->inpFiles[0], "rb");
-	read_data(h_A0, nx,ny,nz,fp);
+  read_data(h_A0, nx, ny, nz, fp);
   fclose(fp);
-  memcpy (h_Anext,h_A0,sizeof(float)*size);
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//memory allocation
-	d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")	
-	
-	//memory copy
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-	//only use tx by ty threads
-	int tx = 32;
-	int ty = 4;
-	size_t block[3] = {tx,ty,1};
-	
-	//also change threads size maping from tx by ty to 2tx x ty
-	size_t grid[3] = {(nx+tx*2-1)/(tx*2)*tx,(ny+ty-1)/ty*ty,1};
-	
-//	int sh_size = tx*2*ty*sizeof(float);
-
-	clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-	clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-	clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-	clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-	clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-	clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-	clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-//	clStatus = clSetKernelArg(clKernel,7,sh_size,NULL);
-	CHECK_ERROR("clSetKernelArg")
-
-	//main execution
-	pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-	int t;
-	for(t=0;t<iteration;t++)
-	{
-		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-		CHECK_ERROR("clEnqueueNDRangeKernel")
-
-    cl_mem d_temp =  d_A0;
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use tx by ty threads
+  int tx = 32;
+  int ty = 4;
+  size_t block[3] = {tx, ty, 1};
+
+  // also change threads size mapping from tx by ty to 2tx x ty
+  size_t grid[3] = {(nx + tx * 2 - 1) / (tx * 2) * tx, (ny + ty - 1) / ty * ty,
+                    1};
+
+  //	int sh_size = tx*2*ty*sizeof(float);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  //	clStatus = clSetKernelArg(clKernel,7,sh_size,NULL);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+
+  int t;
+  for (t = 0; t < iteration; t++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                      block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+
+    cl_mem d_temp = d_A0;
     d_A0 = d_Anext;
     d_Anext = d_temp;
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-
-	}
-
+    clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+    clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  }
 
-  cl_mem d_temp =  d_A0;
+  cl_mem d_temp = d_A0;
   d_A0 = d_Anext;
   d_Anext = d_temp;
 
-	clStatus = clFinish(clCommandQueue);
-	CHECK_ERROR("clFinish")
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")
-
-    	clStatus = clReleaseMemObject(d_A0);
-	clStatus = clReleaseMemObject(d_Anext);
-	clStatus = clReleaseKernel(clKernel);
-	clStatus = clReleaseProgram(clProgram);
-	clStatus = clReleaseCommandQueue(clCommandQueue);
-	clStatus = clReleaseContext(clContext);
-	CHECK_ERROR("clReleaseContext")
- 
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Anext,nx,ny,nz);
-		
-	}
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-		
-	//free((void*)clSource[0]);
-
-	free(h_A0);
-	free(h_Anext);
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
+
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // free((void*)clSource[0]);
+
+  free(h_A0);
+  free(h_Anext);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/common.h
index 3b3a473143ce69a8da5a9bc303371f074781415d..9729c0b9dd70b4958c6e6c45469c4030ea427bd5 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/common.h
@@ -8,6 +8,6 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #define TCF 4
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.h
index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/file.h
@@ -12,9 +12,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c
index f134bdc93350dd2c2abecb7e7f6d36c412d239b2..28c0e5fd7bf24ac79857b3488dc28f12b3c354df 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_strided/main.c
@@ -8,234 +8,235 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
-{
-    int s=0;
-    int i,j,k;
-    for(i=0; i<nz; i++)
-    {
-        for(j=0; j<ny; j++)
-        {
-            for(k=0; k<nx; k++)
-            {
-                fread(A0+s,sizeof(float),1,fp);
-                s++;
-            }
-        }
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated 7 points stencil codes****\n");
-    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-
-    //declaration
-    unsigned nx,ny,nz;
-    unsigned size;
-    int iteration;
-    float c0=1.0f/6.0f;
-    float c1=1.0f/6.0f/6.0f;
-
-    if (argc<5)
-    {
-        printf("Usage: probe nx ny nz t\n"
-               "nx: the grid size x\n"
-               "ny: the grid size y\n"
-               "nz: the grid size z\n"
-               "t: the iteration time\n");
-        return -1;
-    }
-
-    nx = atoi(argv[1]);
-    if (nx<1)
-        return -1;
-    ny = atoi(argv[2]);
-    if (ny<1)
-        return -1;
-    nz = atoi(argv[3]);
-    if (nz<1)
-        return -1;
-    iteration = atoi(argv[4]);
-    if(iteration<1)
-        return -1;
-
-    //host data
-    float *h_A0;
-    float *h_Anext;
-    //load data from files
-
-    size=nx*ny*nz;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0, nx,ny,nz,fp);
-    fclose(fp);
- 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    cl_int clStatus;
-    cl_platform_id clPlatform;
-    clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-    CHECK_ERROR("clGetPlaformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    cl_program clProgram;
-    cl_kernel clKernel;
-
-    pb_CreateAndBuildKernelFromBinary("build/opencl_base_strided_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);
-    //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-    //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-    //CHECK_ERROR("clCreateProgramWithSource")
-
-    //char clOptions[50];
-    //sprintf(clOptions,"-I src/opencl_base");
-    //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-    //CHECK_ERROR("clBuildProgram")
-
-    //cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
-    //CHECK_ERROR("clCreateKernel")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //device
-    cl_mem d_A0;
-    cl_mem d_Anext;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    //memory allocation
-    d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //only use 1D thread block
-    unsigned tx =256/TCF;
-    size_t block[3] = {tx,1,1};
-    size_t grid[3] = {(nx-2+TCF*tx-1)/(TCF*tx)*tx,ny-2,nz-2};
-    //size_t grid[3] = {nx-2,ny-2,nz-2};
-    size_t offset[3] = {1,1,1};
-//  printf("block x is %d and y is %d z \n",block[0],block[1]);
-//  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  // declaration
+  unsigned nx, ny, nz;
+  unsigned size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-    clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-    clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-    clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+  // load data from files
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  read_data(h_A0, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  cl_program clProgram;
+  cl_kernel clKernel;
+
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_base_strided_default/kernel_offline.nvptx.s",
+      "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);
+  // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
+
+  // char clOptions[50];
+  // sprintf(clOptions,"-I src/opencl_base");
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
+
+  // cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use 1D thread block
+  unsigned tx = 256 / TCF;
+  size_t block[3] = {tx, 1, 1};
+  size_t grid[3] = {(nx - 2 + TCF * tx - 1) / (TCF * tx) * tx, ny - 2, nz - 2};
+  // size_t grid[3] = {nx-2,ny-2,nz-2};
+  size_t offset[3] = {1, 1, 1};
+  //  printf("block x is %d and y is %d z \n",block[0],block[1]);
+  //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  int t;
+  for (t = 0; t < iteration; t++) {
+    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                      block, 0, NULL, NULL);
+    // printf("iteration %d\n",t)
+    CHECK_ERROR("clEnqueueNDRangeKernel")
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    int t;
-    for(t=0; t<iteration; t++)
-    {
-        pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-        clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-        //printf("iteration %d\n",t)
-        CHECK_ERROR("clEnqueueNDRangeKernel")
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-        cl_mem d_temp = d_A0;
-        d_A0 = d_Anext;
-        d_Anext = d_temp;
-
-        pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-        clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-        clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    }
-
 
     cl_mem d_temp = d_A0;
     d_A0 = d_Anext;
     d_Anext = d_temp;
 
-    clStatus = clFinish(clCommandQueue);
-    CHECK_ERROR("clFinish")
+    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+    clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  }
+
+  cl_mem d_temp = d_A0;
+  d_A0 = d_Anext;
+  d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseMemObject(d_A0);
-    clStatus = clReleaseMemObject(d_Anext);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-    CHECK_ERROR("clReleaseContext")
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
 
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+  pb_PrintTimerSet(&timers);
 
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
-    free(h_A0);
-    free(h_Anext);
-    pb_FreeParameters(parameters);
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/common.h
index 042bd64a23d897959a4145e6d2b42df76053e74c..12a6d131c29067073fa79f09c4e6f91b8662969c 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/common.h
@@ -10,6 +10,6 @@
 #define _COMMON_H_
 //#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
 // +3 for padding
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))+3)
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)) + 3)
 #define TCF 4
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.h
index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/file.h
@@ -12,9 +12,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c
index f387827795b21c00d382eea794f1ba823df05c20..f767f6a9d29094623296e012a6b2671954b0546a 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_base_vec/main.c
@@ -8,234 +8,235 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
-{
-    int s=0;
-    int i,j,k;
-    for(i=0; i<nz; i++)
-    {
-        for(j=0; j<ny; j++)
-        {
-            for(k=0; k<nx; k++)
-            {
-                fread(A0+s,sizeof(float),1,fp);
-                s++;
-            }
-        }
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated 7 points stencil codes****\n");
-    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-
-    //declaration
-    unsigned nx,ny,nz;
-    unsigned size;
-    int iteration;
-    float c0=1.0f/6.0f;
-    float c1=1.0f/6.0f/6.0f;
-
-    if (argc<5)
-    {
-        printf("Usage: probe nx ny nz t\n"
-               "nx: the grid size x\n"
-               "ny: the grid size y\n"
-               "nz: the grid size z\n"
-               "t: the iteration time\n");
-        return -1;
-    }
-
-    nx = atoi(argv[1]);
-    if (nx<1)
-        return -1;
-    ny = atoi(argv[2]);
-    if (ny<1)
-        return -1;
-    nz = atoi(argv[3]);
-    if (nz<1)
-        return -1;
-    iteration = atoi(argv[4]);
-    if(iteration<1)
-        return -1;
-
-    //host data
-    float *h_A0;
-    float *h_Anext;
-    //load data from files
-
-    size=nx*ny*nz;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0, nx,ny,nz,fp);
-    fclose(fp);
- 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    cl_int clStatus;
-    cl_platform_id clPlatform;
-    clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-    CHECK_ERROR("clGetPlaformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    cl_program clProgram;
-    cl_kernel clKernel;
-
-    pb_CreateAndBuildKernelFromBinary("build/opencl_base_vec_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);
-    //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-    //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-    //CHECK_ERROR("clCreateProgramWithSource")
-
-    //char clOptions[50];
-    //sprintf(clOptions,"-I src/opencl_base");
-    //clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-    //CHECK_ERROR("clBuildProgram")
-
-    //cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
-    //CHECK_ERROR("clCreateKernel")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //device
-    cl_mem d_A0;
-    cl_mem d_Anext;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    //memory allocation
-    d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //only use 1D thread block
-    unsigned tx =256/TCF;
-    size_t block[3] = {tx,1,1};
-    size_t grid[3] = {(nx-2+TCF*tx-1)/(TCF*tx)*tx,ny-2,nz-2};
-    //size_t grid[3] = {nx-2,ny-2,nz-2};
-    size_t offset[3] = {1,1,1};
-//  printf("block x is %d and y is %d z \n",block[0],block[1]);
-//  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  // declaration
+  unsigned nx, ny, nz;
+  unsigned size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-    clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-    clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-    clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+  // load data from files
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  read_data(h_A0, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  cl_program clProgram;
+  cl_kernel clKernel;
+
+  pb_CreateAndBuildKernelFromBinary(
+      "build/opencl_base_vec_default/kernel_offline.nvptx.s", "naive_kernel",
+      &clContext, &clDevice, &clProgram, &clKernel);
+  // const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  // cl_program clProgram =
+  // clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  // CHECK_ERROR("clCreateProgramWithSource")
+
+  // char clOptions[50];
+  // sprintf(clOptions,"-I src/opencl_base");
+  // clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  // CHECK_ERROR("clBuildProgram")
+
+  // cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
+  // CHECK_ERROR("clCreateKernel")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use 1D thread block
+  unsigned tx = 256 / TCF;
+  size_t block[3] = {tx, 1, 1};
+  size_t grid[3] = {(nx - 2 + TCF * tx - 1) / (TCF * tx) * tx, ny - 2, nz - 2};
+  // size_t grid[3] = {nx-2,ny-2,nz-2};
+  size_t offset[3] = {1, 1, 1};
+  //  printf("block x is %d and y is %d z \n",block[0],block[1]);
+  //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  int t;
+  for (t = 0; t < iteration; t++) {
+    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                      block, 0, NULL, NULL);
+    // printf("iteration %d\n",t)
+    CHECK_ERROR("clEnqueueNDRangeKernel")
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    int t;
-    for(t=0; t<iteration; t++)
-    {
-        pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-        clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-        //printf("iteration %d\n",t)
-        CHECK_ERROR("clEnqueueNDRangeKernel")
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-        cl_mem d_temp = d_A0;
-        d_A0 = d_Anext;
-        d_Anext = d_temp;
-
-        pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-        clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-        clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    }
-
 
     cl_mem d_temp = d_A0;
     d_A0 = d_Anext;
     d_Anext = d_temp;
 
-    clStatus = clFinish(clCommandQueue);
-    CHECK_ERROR("clFinish")
+    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+    clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  }
+
+  cl_mem d_temp = d_A0;
+  d_A0 = d_Anext;
+  d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseMemObject(d_A0);
-    clStatus = clReleaseMemObject(d_Anext);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-    CHECK_ERROR("clReleaseContext")
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
 
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+  pb_PrintTimerSet(&timers);
 
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
-    free(h_A0);
-    free(h_Anext);
-    pb_FreeParameters(parameters);
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.h
index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/file.h
@@ -12,9 +12,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c
index 3456ba2a64d942ec20116ade4513e6f9abe888c5..10626bed59111d3ded3429626463966914218a5c 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu/main.c
@@ -8,239 +8,240 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
-{
-    int s=0;
-    int i,j,k;
-    for(i=0; i<nz; i++)
-    {
-        for(j=0; j<ny; j++)
-        {
-            for(k=0; k<nx; k++)
-            {
-                fread(A0+s,sizeof(float),1,fp);
-                s++;
-            }
-        }
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated 7 points stencil codes****\n");
-    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-
-    //declaration
-    unsigned nx,ny,nz;
-    unsigned size;
-    int iteration;
-    float c0=1.0f/6.0f;
-    float c1=1.0f/6.0f/6.0f;
-
-    if (argc<5)
-    {
-        printf("Usage: probe nx ny nz t\n"
-               "nx: the grid size x\n"
-               "ny: the grid size y\n"
-               "nz: the grid size z\n"
-               "t: the iteration time\n");
-        return -1;
-    }
-
-    nx = atoi(argv[1]);
-    if (nx<1)
-        return -1;
-    ny = atoi(argv[2]);
-    if (ny<1)
-        return -1;
-    nz = atoi(argv[3]);
-    if (nz<1)
-        return -1;
-    iteration = atoi(argv[4]);
-    if(iteration<1)
-        return -1;
-
-    //host data
-    float *h_A0;
-    float *h_Anext;
-    //load data from files
-
-    size=nx*ny*nz;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0, nx,ny,nz,fp);
-    fclose(fp);
- 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    cl_int clStatus;
-
-    cl_uint numPlatforms;
-    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
-    CHECK_ERROR("clGetPlaformIDs")
-    
-    cl_platform_id clPlatform[numPlatforms];
-    clStatus = clGetPlatformIDs(2,clPlatform,NULL);
-    CHECK_ERROR("clGetPlaformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    /*cl_program clProgram;*/
-    /*cl_kernel clKernel;*/
-
-    /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/
-    const char* clSource[] = {readFile("src/opencl_cpu/kernel.cl")};
-    cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-    CHECK_ERROR("clCreateProgramWithSource")
-
-    char clOptions[50];
-    sprintf(clOptions,"-I src/opencl_cpu");
-    clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-    CHECK_ERROR("clBuildProgram")
-
-    cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
-    CHECK_ERROR("clCreateKernel")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //device
-    cl_mem d_A0;
-    cl_mem d_Anext;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    //memory allocation
-    d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //only use 1D thread block
-    unsigned tx =256;
-    size_t block[3] = {tx,1,1};
-    size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
-    //size_t grid[3] = {nx-2,ny-2,nz-2};
-    size_t offset[3] = {1,1,1};
-//  printf("block x is %d and y is %d z \n",block[0],block[1]);
-//  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  // declaration
+  unsigned nx, ny, nz;
+  unsigned size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-    clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-    clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-    clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+  // load data from files
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  read_data(h_A0, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  cl_int clStatus;
+
+  cl_uint numPlatforms;
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(2, clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+
+  cl_device_id clDevice;
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  /*cl_program clProgram;*/
+  /*cl_kernel clKernel;*/
+
+  /*pb_CreateAndBuildKernelFromBinary("build/opencl_base_default/kernel_offline.nvptx.s",
+   * "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/
+  const char *clSource[] = {readFile("src/opencl_cpu/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "-I src/opencl_cpu");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "naive_kernel", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use 1D thread block
+  unsigned tx = 256;
+  size_t block[3] = {tx, 1, 1};
+  size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2};
+  // size_t grid[3] = {nx-2,ny-2,nz-2};
+  size_t offset[3] = {1, 1, 1};
+  //  printf("block x is %d and y is %d z \n",block[0],block[1]);
+  //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  int t;
+  for (t = 0; t < iteration; t++) {
+    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                      block, 0, NULL, NULL);
+    // printf("iteration %d\n",t)
+    CHECK_ERROR("clEnqueueNDRangeKernel")
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    int t;
-    for(t=0; t<iteration; t++)
-    {
-        pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-        clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-        //printf("iteration %d\n",t)
-        CHECK_ERROR("clEnqueueNDRangeKernel")
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-        cl_mem d_temp = d_A0;
-        d_A0 = d_Anext;
-        d_Anext = d_temp;
-
-        pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-        clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-        clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    }
-
 
     cl_mem d_temp = d_A0;
     d_A0 = d_Anext;
     d_Anext = d_temp;
 
-    clStatus = clFinish(clCommandQueue);
-    CHECK_ERROR("clFinish")
+    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+    clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  }
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
+  cl_mem d_temp = d_A0;
+  d_A0 = d_Anext;
+  d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseMemObject(d_A0);
-    clStatus = clReleaseMemObject(d_Anext);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-    CHECK_ERROR("clReleaseContext")
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  pb_PrintTimerSet(&timers);
+
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    //free((void*)clSource[0]);
+  // free((void*)clSource[0]);
 
-    free(h_A0);
-    free(h_Anext);
-    pb_FreeParameters(parameters);
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.h
index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/file.h
@@ -12,9 +12,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c
index 9d7457c39388f7652d53d7cd1b4872676b2831ae..1d03111f209173dfc2462cb274e1bb0ac56e9c8c 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_baseline/main.c
@@ -8,237 +8,238 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
-{
-    int s=0;
-    int i,j,k;
-    for(i=0; i<nz; i++)
-    {
-        for(j=0; j<ny; j++)
-        {
-            for(k=0; k<nx; k++)
-            {
-                fread(A0+s,sizeof(float),1,fp);
-                s++;
-            }
-        }
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated 7 points stencil codes****\n");
-    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-
-    //declaration
-    unsigned nx,ny,nz;
-    unsigned size;
-    int iteration;
-    float c0=1.0f/6.0f;
-    float c1=1.0f/6.0f/6.0f;
-
-    if (argc<5)
-    {
-        printf("Usage: probe nx ny nz t\n"
-               "nx: the grid size x\n"
-               "ny: the grid size y\n"
-               "nz: the grid size z\n"
-               "t: the iteration time\n");
-        return -1;
-    }
-
-    nx = atoi(argv[1]);
-    if (nx<1)
-        return -1;
-    ny = atoi(argv[2]);
-    if (ny<1)
-        return -1;
-    nz = atoi(argv[3]);
-    if (nz<1)
-        return -1;
-    iteration = atoi(argv[4]);
-    if(iteration<1)
-        return -1;
-
-    //host data
-    float *h_A0;
-    float *h_Anext;
-    //load data from files
-
-    size=nx*ny*nz;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0, nx,ny,nz,fp);
-    fclose(fp);
- 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    cl_int clStatus;
-
-    cl_uint numPlatforms;
-    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
-    CHECK_ERROR("clGetPlaformIDs")
-    
-    cl_platform_id clPlatform[numPlatforms];
-    clStatus = clGetPlatformIDs(numPlatforms,clPlatform,NULL);
-    CHECK_ERROR("clGetPlaformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    /*cl_program clProgram;*/
-    /*cl_kernel clKernel;*/
-
-    /*pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);*/
-    const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-    cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-    CHECK_ERROR("clCreateProgramWithSource")
-
-    char clOptions[50];
-    sprintf(clOptions,"-I src/opencl_base");
-    clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-    CHECK_ERROR("clBuildProgram")
-
-    cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
-    CHECK_ERROR("clCreateKernel")
-
-
-    //device
-    cl_mem d_A0;
-    cl_mem d_Anext;
-
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-
-
-    //memory allocation
-    d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //only use 1D thread block
-    unsigned tx =256;
-    size_t block[3] = {tx,1,1};
-    size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
-    //size_t grid[3] = {nx-2,ny-2,nz-2};
-    size_t offset[3] = {1,1,1};
-//  printf("block x is %d and y is %d z \n",block[0],block[1]);
-//  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
-    /*printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]);*/
-
-    clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-    clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-    clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-    clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    int t;
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    /*for(int i=0; i<1; i++) {*/
-      for(t=0; t<iteration; t++)
-      {
-          clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-          //printf("iteration %d\n",t)
-          CHECK_ERROR("clEnqueueNDRangeKernel")
-
-          cl_mem d_temp = d_A0;
-          d_A0 = d_Anext;
-          d_Anext = d_temp;
-
-          clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-          clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-
-      }
-    /*}*/
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  // declaration
+  unsigned nx, ny, nz;
+  unsigned size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
 
-    /*clStatus = clFinish(clCommandQueue);*/
-    /*pb_SwitchToTimer(&timers, pb_TimerID_NONE);*/
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+  // load data from files
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  read_data(h_A0, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  cl_int clStatus;
+
+  cl_uint numPlatforms;
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+
+  cl_device_id clDevice;
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  /*cl_program clProgram;*/
+  /*cl_kernel clKernel;*/
+
+  /*pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext,
+   * &clDevice, &clProgram, &clKernel);*/
+  const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "-I src/opencl_base");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram, "naive_kernel", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use 1D thread block
+  unsigned tx = 256;
+  size_t block[3] = {tx, 1, 1};
+  size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2};
+  // size_t grid[3] = {nx-2,ny-2,nz-2};
+  size_t offset[3] = {1, 1, 1};
+  //  printf("block x is %d and y is %d z \n",block[0],block[1]);
+  //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+  /*printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1],
+   * grid[2], block[0], block[1], block[2]);*/
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  int t;
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  /*for(int i=0; i<1; i++) {*/
+  for (t = 0; t < iteration; t++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                      block, 0, NULL, NULL);
+    // printf("iteration %d\n",t)
+    CHECK_ERROR("clEnqueueNDRangeKernel")
 
     cl_mem d_temp = d_A0;
     d_A0 = d_Anext;
     d_Anext = d_temp;
 
-    clStatus = clFinish(clCommandQueue);
-    CHECK_ERROR("clFinish")
+    clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+    clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  }
+  /*}*/
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
+  /*clStatus = clFinish(clCommandQueue);*/
+  /*pb_SwitchToTimer(&timers, pb_TimerID_NONE);*/
 
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    clStatus = clReleaseMemObject(d_A0);
-    clStatus = clReleaseMemObject(d_Anext);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-    CHECK_ERROR("clReleaseContext")
+  cl_mem d_temp = d_A0;
+  d_A0 = d_Anext;
+  d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
 
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
 
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    /*free((void*)clSource[0]);*/
+  /*free((void*)clSource[0]);*/
 
-    free(h_A0);
-    free(h_Anext);
-    pb_FreeParameters(parameters);
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.h
index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/file.h
@@ -12,9 +12,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c
index c9868918043b6cf815a823bd99ac12b62447209e..cf86734a8639ce38eb2b1ac8280582e7bde4531c 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_default/main.c
@@ -8,245 +8,248 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
-{
-    int s=0;
-    int i,j,k;
-    for(i=0; i<nz; i++)
-    {
-        for(j=0; j<ny; j++)
-        {
-            for(k=0; k<nx; k++)
-            {
-                fread(A0+s,sizeof(float),1,fp);
-                s++;
-            }
-        }
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated 7 points stencil codes****\n");
-    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-
-    //declaration
-    unsigned nx,ny,nz;
-    unsigned size;
-    int iteration;
-    float c0=1.0f/6.0f;
-    float c1=1.0f/6.0f/6.0f;
-
-    if (argc<5)
-    {
-        printf("Usage: probe nx ny nz t\n"
-               "nx: the grid size x\n"
-               "ny: the grid size y\n"
-               "nz: the grid size z\n"
-               "t: the iteration time\n");
-        return -1;
-    }
-
-    nx = atoi(argv[1]);
-    if (nx<1)
-        return -1;
-    ny = atoi(argv[2]);
-    if (ny<1)
-        return -1;
-    nz = atoi(argv[3]);
-    if (nz<1)
-        return -1;
-    iteration = atoi(argv[4]);
-    if(iteration<1)
-        return -1;
-
-    //host data
-    float *h_A0;
-    float *h_Anext;
-    //load data from files
-
-    size=nx*ny*nz;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0, nx,ny,nz,fp);
-    fclose(fp);
- 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    cl_int clStatus;
-
-    cl_uint numPlatforms;
-    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
-    CHECK_ERROR("clGetPlaformIDs")
-    
-    cl_platform_id clPlatform[numPlatforms];
-    clStatus = clGetPlatformIDs(numPlatforms,clPlatform,NULL);
-    CHECK_ERROR("clGetPlaformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    cl_program clProgram;
-    cl_kernel clKernel;
-
-    pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);
-    /*const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};*/
-    /*cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
-    /*CHECK_ERROR("clCreateProgramWithSource")*/
-
-    /*char clOptions[50];*/
-    /*sprintf(clOptions,"-I src/opencl_base");*/
-    /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/
-    /*CHECK_ERROR("clBuildProgram")*/
-
-    /*cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);*/
-    /*CHECK_ERROR("clCreateKernel")*/
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //device
-    cl_mem d_A0;
-    cl_mem d_Anext;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    //memory allocation
-    d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //only use 1D thread block
-    unsigned tx =256;
-    size_t block[3] = {tx,1,1};
-    size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
-    //size_t grid[3] = {nx-2,ny-2,nz-2};
-    size_t offset[3] = {1,1,1};
-//  printf("block x is %d and y is %d z \n",block[0],block[1]);
-//  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
-    printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-    clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-    clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-    clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    int t;
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    for(int i=0; i<2; i++) {
-      for(t=0; t<iteration; t++)
-      {
-          /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-          clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-          //printf("iteration %d\n",t)
-          CHECK_ERROR("clEnqueueNDRangeKernel")
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-
-          cl_mem d_temp = d_A0;
-          d_A0 = d_Anext;
-          d_Anext = d_temp;
-
-          /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/
-          clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-          clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  // declaration
+  unsigned nx, ny, nz;
+  unsigned size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
 
-      }
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+  // load data from files
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  read_data(h_A0, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  cl_int clStatus;
+
+  cl_uint numPlatforms;
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+
+  cl_device_id clDevice;
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  cl_program clProgram;
+  cl_kernel clKernel;
+
+  pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext,
+                                    &clDevice, &clProgram, &clKernel);
+  /*const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};*/
+  /*cl_program clProgram =
+   * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
+  /*CHECK_ERROR("clCreateProgramWithSource")*/
+
+  /*char clOptions[50];*/
+  /*sprintf(clOptions,"-I src/opencl_base");*/
+  /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/
+  /*CHECK_ERROR("clBuildProgram")*/
+
+  /*cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);*/
+  /*CHECK_ERROR("clCreateKernel")*/
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use 1D thread block
+  unsigned tx = 256;
+  size_t block[3] = {tx, 1, 1};
+  size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2};
+  // size_t grid[3] = {nx-2,ny-2,nz-2};
+  size_t offset[3] = {1, 1, 1};
+  //  printf("block x is %d and y is %d z \n",block[0],block[1]);
+  //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+  printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1],
+         grid[2], block[0], block[1], block[2]);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  int t;
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  for (int i = 0; i < 2; i++) {
+    for (t = 0; t < iteration; t++) {
+      /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+      clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                        block, 0, NULL, NULL);
+      // printf("iteration %d\n",t)
+      CHECK_ERROR("clEnqueueNDRangeKernel")
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+
+      cl_mem d_temp = d_A0;
+      d_A0 = d_Anext;
+      d_Anext = d_temp;
+
+      /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/
+      clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+      clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
     }
+  }
 
-    clStatus = clFinish(clCommandQueue);
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    cl_mem d_temp = d_A0;
-    d_A0 = d_Anext;
-    d_Anext = d_temp;
+  clStatus = clFinish(clCommandQueue);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    /*clStatus = clFinish(clCommandQueue);*/
-    /*CHECK_ERROR("clFinish")*/
+  cl_mem d_temp = d_A0;
+  d_A0 = d_Anext;
+  d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
+  /*clStatus = clFinish(clCommandQueue);*/
+  /*CHECK_ERROR("clFinish")*/
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseMemObject(d_A0);
-    clStatus = clReleaseMemObject(d_Anext);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-    CHECK_ERROR("clReleaseContext")
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
 
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+  pb_PrintTimerSet(&timers);
 
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    /*free((void*)clSource[0]);*/
+  /*free((void*)clSource[0]);*/
 
-    free(h_A0);
-    free(h_Anext);
-    pb_FreeParameters(parameters);
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.h
index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/file.h
@@ -12,9 +12,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c
index 88fc557e6e7f7e9796e081d1dd63e52aea102ac8..3b009e370e284a5b5b705bcc3a8122547a83c177 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_cpu_large/main.c
@@ -8,245 +8,248 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
-{
-    int s=0;
-    int i,j,k;
-    for(i=0; i<nz; i++)
-    {
-        for(j=0; j<ny; j++)
-        {
-            for(k=0; k<nx; k++)
-            {
-                fread(A0+s,sizeof(float),1,fp);
-                s++;
-            }
-        }
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated 7 points stencil codes****\n");
-    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-
-    //declaration
-    unsigned nx,ny,nz;
-    unsigned size;
-    int iteration;
-    float c0=1.0f/6.0f;
-    float c1=1.0f/6.0f/6.0f;
-
-    if (argc<5)
-    {
-        printf("Usage: probe nx ny nz t\n"
-               "nx: the grid size x\n"
-               "ny: the grid size y\n"
-               "nz: the grid size z\n"
-               "t: the iteration time\n");
-        return -1;
-    }
-
-    nx = atoi(argv[1]);
-    if (nx<1)
-        return -1;
-    ny = atoi(argv[2]);
-    if (ny<1)
-        return -1;
-    nz = atoi(argv[3]);
-    if (nz<1)
-        return -1;
-    iteration = atoi(argv[4]);
-    if(iteration<1)
-        return -1;
-
-    //host data
-    float *h_A0;
-    float *h_Anext;
-    //load data from files
-
-    size=nx*ny*nz;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0, nx,ny,nz,fp);
-    fclose(fp);
- 
-    pb_InitializeTimerSet(&timers);
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    cl_int clStatus;
-
-    cl_uint numPlatforms;
-    clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
-    CHECK_ERROR("clGetPlaformIDs")
-    
-    cl_platform_id clPlatform[numPlatforms];
-    clStatus = clGetPlatformIDs(numPlatforms,clPlatform,NULL);
-    CHECK_ERROR("clGetPlaformIDs")
-
-    cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-
-    cl_device_id clDevice;
-    clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
-    CHECK_ERROR("clGetDeviceIDs")
-
-    cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_CPU,NULL,NULL,&clStatus);
-    CHECK_ERROR("clCreateContextFromType")
-
-    cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-    CHECK_ERROR("clCreateCommandQueue")
-
-    pb_SetOpenCL(&clContext, &clCommandQueue);
-
-    cl_program clProgram;
-    cl_kernel clKernel;
-
-    pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext, &clDevice, &clProgram, &clKernel);
-    /*const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};*/
-    /*cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
-    /*CHECK_ERROR("clCreateProgramWithSource")*/
-
-    /*char clOptions[50];*/
-    /*sprintf(clOptions,"-I src/opencl_base");*/
-    /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/
-    /*CHECK_ERROR("clBuildProgram")*/
-
-    /*cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);*/
-    /*CHECK_ERROR("clCreateKernel")*/
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //device
-    cl_mem d_A0;
-    cl_mem d_Anext;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-
-    //memory allocation
-    d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    //memory copy
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    //only use 1D thread block
-    unsigned tx =256;
-    size_t block[3] = {tx,1,1};
-    size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
-    //size_t grid[3] = {nx-2,ny-2,nz-2};
-    size_t offset[3] = {1,1,1};
-//  printf("block x is %d and y is %d z \n",block[0],block[1]);
-//  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
-    printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-    clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-    clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-    clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-    clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-    CHECK_ERROR("clSetKernelArg")
-
-    //main execution
-    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    int t;
-    pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
-    for(int i=0; i<1; i++) {
-      for(t=0; t<iteration; t++)
-      {
-          /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
-          clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-          //printf("iteration %d\n",t)
-          CHECK_ERROR("clEnqueueNDRangeKernel")
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-
-          cl_mem d_temp = d_A0;
-          d_A0 = d_Anext;
-          d_Anext = d_temp;
-
-          /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/
-          clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-          clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-          /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  // declaration
+  unsigned nx, ny, nz;
+  unsigned size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
 
-      }
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+  // load data from files
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  read_data(h_A0, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  cl_int clStatus;
+
+  cl_uint numPlatforms;
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_platform_id clPlatform[numPlatforms];
+  clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+
+  cl_device_id clDevice;
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  cl_program clProgram;
+  cl_kernel clKernel;
+
+  pb_CreateAndBuildKernelFromBinary("kernel.ir", "naive_kernel", &clContext,
+                                    &clDevice, &clProgram, &clKernel);
+  /*const char* clSource[] = {readFile("src/opencl_base/kernel_offline.cl")};*/
+  /*cl_program clProgram =
+   * clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);*/
+  /*CHECK_ERROR("clCreateProgramWithSource")*/
+
+  /*char clOptions[50];*/
+  /*sprintf(clOptions,"-I src/opencl_base");*/
+  /*clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);*/
+  /*CHECK_ERROR("clBuildProgram")*/
+
+  /*cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);*/
+  /*CHECK_ERROR("clCreateKernel")*/
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use 1D thread block
+  unsigned tx = 256;
+  size_t block[3] = {tx, 1, 1};
+  size_t grid[3] = {(nx - 2 + tx - 1) / tx * tx, ny - 2, nz - 2};
+  // size_t grid[3] = {nx-2,ny-2,nz-2};
+  size_t offset[3] = {1, 1, 1};
+  //  printf("block x is %d and y is %d z \n",block[0],block[1]);
+  //  printf("grid x is %d and y is %d\n",grid[0],grid[1]);
+  printf("grid(%lu, %lu, %lu), block(%lu, %lu, %lu)\n", grid[0], grid[1],
+         grid[2], block[0], block[1], block[2]);
+
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  int t;
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+  for (int i = 0; i < 1; i++) {
+    for (t = 0; t < iteration; t++) {
+      /*pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);*/
+      clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                        block, 0, NULL, NULL);
+      // printf("iteration %d\n",t)
+      CHECK_ERROR("clEnqueueNDRangeKernel")
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+
+      cl_mem d_temp = d_A0;
+      d_A0 = d_Anext;
+      d_Anext = d_temp;
+
+      /*pb_SwitchToTimer(&timers, visc_TimerID_SETUP);*/
+      clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+      clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+      /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
     }
+  }
 
-    clStatus = clFinish(clCommandQueue);
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-    cl_mem d_temp = d_A0;
-    d_A0 = d_Anext;
-    d_Anext = d_temp;
+  clStatus = clFinish(clCommandQueue);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    /*clStatus = clFinish(clCommandQueue);*/
-    /*CHECK_ERROR("clFinish")*/
+  cl_mem d_temp = d_A0;
+  d_A0 = d_Anext;
+  d_Anext = d_temp;
 
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueReadBuffer")
+  /*clStatus = clFinish(clCommandQueue);*/
+  /*CHECK_ERROR("clFinish")*/
 
-    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-    clStatus = clReleaseMemObject(d_A0);
-    clStatus = clReleaseMemObject(d_Anext);
-    clStatus = clReleaseKernel(clKernel);
-    clStatus = clReleaseProgram(clProgram);
-    clStatus = clReleaseCommandQueue(clCommandQueue);
-    clStatus = clReleaseContext(clContext);
-    CHECK_ERROR("clReleaseContext")
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
 
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+  pb_PrintTimerSet(&timers);
 
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
 
-    /*free((void*)clSource[0]);*/
+  /*free((void*)clSource[0]);*/
 
-    free(h_A0);
-    free(h_Anext);
-    pb_FreeParameters(parameters);
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
 
-    return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.cc
index c690d13171488dbb05c3e707639373d2e89bf18d..4f8b0e2660704b90733827d55f835d408ef2b0da 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.cc
@@ -7,64 +7,56 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
-
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
-
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+extern "C" char *readFile(const char *fileName) { // Loads the entire file into a heap buffer with a terminating zero byte; exits the process on any failure.
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) { // Error 1: the file could not be opened
+    printf("Error 1!\n");
+    exit(1);
+  }
+
+  fseek(fp, 0, SEEK_END); // move to the end so ftell reports the file length
+  long size = ftell(fp); // NOTE(review): a failing ftell (-1) is not checked before size is used below
+  rewind(fp);
+
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1)); // one extra byte for the terminator written below
+  if (buffer == NULL) { // Error 2: allocation failed
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
+
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) { // Error 3: short read; NOTE(review): compares size_t against long
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
+
+  buffer[size] = 0; // terminate so the contents can be used as a C string
+  fclose(fp);
+  return buffer; // buffer is not freed here; releasing it is left to the caller
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.h
index 124fef8e655943e5e135fc4f189e9b70c552ce40..daf3b5e161194f2e2fda4c336651cbde7d1dee27 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/file.h
@@ -13,7 +13,8 @@
 extern "C"
 #endif
 
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+    void
+    outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/main.c
index 76d651402d873ab0f9be9fcd28138024f64e3e3c..7116ca362087b6f95f99f8e2e0a9af1fbe0ddd24 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_fermi/main.c
@@ -8,215 +8,215 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) 
-{	
-	int s=0;
-	int i,j,k;
-	for(i=0;i<nz;i++)
-	{
-		for(j=0;j<ny;j++)
-		{
-			for(k=0;k<nx;k++)
-			{
-                                fread(A0+s,sizeof(float),1,fp);
-				s++;
-			}
-		}
-	}
-	return 0;
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { // Fills A0 with nx*ny*nz floats read sequentially from fp; always returns 0.
+  int s = 0; // flat write index into A0, advanced once per element
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp); // NOTE(review): return value ignored, so a short or failed read goes undetected
+        s++;
+      }
+    }
+  }
+  return 0; // no error is ever reported to the caller
 }
 
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // declaration
+  int nx, ny, nz;
+  int size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time");
+    return -1;
+  }
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	printf("OpenCL accelerated 7 points stencil codes****\n");
-	printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-	parameters = pb_ReadParameters(&argc, argv);
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	//declaration
-	int nx,ny,nz;
-	int size;
- 	int iteration;
-	float c0=1.0f/6.0f;
-	float c1=1.0f/6.0f/6.0f;
-
-	if (argc<5) 
-    	{
-	     printf("Usage: probe nx ny nz t\n"
-	     "nx: the grid size x\n"
-	     "ny: the grid size y\n"
-	     "nz: the grid size z\n"
-	     "t: the iteration time");
-	     return -1;
-	}
-
-	nx = atoi(argv[1]);
-	if (nx<1)
-		return -1;
-	ny = atoi(argv[2]);
-	if (ny<1)
-		return -1;
-	nz = atoi(argv[3]);
-	if (nz<1)
-		return -1;
-	iteration = atoi(argv[4]);
-	if(iteration<1)
-		return -1;
-	
-	cl_int clStatus;
-	cl_platform_id clPlatform;
-	clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-	CHECK_ERROR("clGetPlaformIDs")
-
-	cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-	
-	cl_device_id clDevice;
-	clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-	CHECK_ERROR("clGetDeviceIDs")
-
-	cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
-
-	cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-	const char* clSource[] = {readFile("src/opencl_fermi/kernel.cl")};
-	cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
-
-	char clOptions[50];
-	sprintf(clOptions,"-I src/opencl_fermi");
-	clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-	CHECK_ERROR("clBuildProgram")
-
-	cl_kernel clKernel = clCreateKernel(clProgram,"block2D_reg_tiling",&clStatus);
-	CHECK_ERROR("clCreateKernel") 			
-
-	//host data
-	float *h_A0;
-	float *h_Anext;
-	
-	//device
-	cl_mem d_A0;
-	cl_mem d_Anext;
-
-	//load data from files
-	size=nx*ny*nz;
-	
-	h_A0=(float*)malloc(sizeof(float)*size);
-	h_Anext=(float*)malloc(sizeof(float)*size);
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  const char *clSource[] = {readFile("src/opencl_fermi/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "-I src/opencl_fermi");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel =
+      clCreateKernel(clProgram, "block2D_reg_tiling", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  // load data from files
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
   FILE *fp = fopen(parameters->inpFiles[0], "rb");
-	read_data(h_A0, nx,ny,nz,fp);
+  read_data(h_A0, nx, ny, nz, fp);
   fclose(fp);
-  memcpy (h_Anext,h_A0,sizeof(float)*size);
- 
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//memory allocation
-	d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")	
-	
-	//memory copy
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-	//only use tx-by-ty threads
-	size_t tx = 512;
-	size_t ty = 2;
-	size_t block[3] = {tx,ty,1};
-	size_t grid[3] = {(nx+tx-1)/tx*block[0],(ny+ty-1)/ty*block[1],1};
-
-	clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-	clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-	clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-	clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-	clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-	clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-	clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-	CHECK_ERROR("clSetKernelArg")
-
-	//main execution
-	pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-	int t;
-	for(t=0;t<iteration;t++)
-	{
-		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-		CHECK_ERROR("clEnqueueNDRangeKernel")
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use tx-by-ty threads
+  size_t tx = 512;
+  size_t ty = 2;
+  size_t block[3] = {tx, ty, 1};
+  size_t grid[3] = {(nx + tx - 1) / tx * block[0],
+                    (ny + ty - 1) / ty * block[1], 1};
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+
+  int t;
+  for (t = 0; t < iteration; t++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                      block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
     cl_mem d_temp = d_A0;
     d_A0 = d_Anext;
     d_Anext = d_temp;
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-
-	}
+    clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+    clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  }
 
   cl_mem d_temp = d_A0;
   d_A0 = d_Anext;
   d_Anext = d_temp;
 
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
+
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free((void *)clSource[0]);
+
+  free(h_A0);
+  free(h_Anext);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
 
-	clStatus = clFinish(clCommandQueue);
-	CHECK_ERROR("clFinish")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")
-
- 	clStatus = clReleaseMemObject(d_A0);
-	clStatus = clReleaseMemObject(d_Anext);
-	clStatus = clReleaseKernel(clKernel);
-	clStatus = clReleaseProgram(clProgram);
-	clStatus = clReleaseCommandQueue(clCommandQueue);
-	clStatus = clReleaseContext(clContext);
-	CHECK_ERROR("clReleaseContext")
- 
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Anext,nx,ny,nz);
-		
-	}
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-		
-	free((void*)clSource[0]);
-
-	free(h_A0);
-	free(h_Anext);
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/common.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.h b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.h
index 85c998198e9ad26c4ac912439c533ec9ca4d7ada..0d2e87b0f14004d71ecedc86e822b0fdde8d6252 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/file.h
@@ -13,9 +13,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/main.c b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/main.c
index d5ff1e913a6f4a098b563e8d24fa1cf2550ecf8c..526666c45c10a077407e6162498fcef8fd4159c2 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/main.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/opencl_nvidia/main.c
@@ -8,225 +8,227 @@
  ***************************************************************************/
 
 #include <CL/cl.h>
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>i
 #include <string.h>
-#include <parboil.h>
 
-#include "file.h"
 #include "common.h"
+#include "file.h"
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) 
-{	
-	int s=0;
-	int i,j,k;
-	for(i=0;i<nz;i++)
-	{
-		for(j=0;j<ny;j++)
-		{
-			for(k=0;k<nx;k++)
-			{
-                                fread(A0+s,sizeof(float),1,fp);
-				s++;
-			}
-		}
-	}
-	return 0;
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) { // Fills A0 with nx*ny*nz floats read sequentially from fp; always returns 0.
+  int s = 0; // flat write index into A0, advanced once per element
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp); // NOTE(review): return value ignored, so a short or failed read goes undetected
+        s++;
+      }
+    }
+  }
+  return 0; // no error is ever reported to the caller
 }
 
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // declaration
+  int nx, ny, nz;
+  int size;
+  int iteration;
+  float c0 = 1.0f / 6.0f;
+  float c1 = 1.0f / 6.0f / 6.0f;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
 
-int main(int argc, char** argv) {
-	struct pb_TimerSet timers;
-	struct pb_Parameters *parameters;
-	
-	printf("OpenCL accelerated 7 points stencil codes****\n");
-	printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-	parameters = pb_ReadParameters(&argc, argv);
-
-	pb_InitializeTimerSet(&timers);
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	
-	//declaration
-	int nx,ny,nz;
-	int size;
-	int iteration;
-	float c0=1.0f/6.0f;
-	float c1=1.0f/6.0f/6.0f;
-
-	if (argc<5) 
-    	{
-	     printf("Usage: probe nx ny nz t\n"
-	     "nx: the grid size x\n"
-	     "ny: the grid size y\n"
-	     "nz: the grid size z\n"
-	     "t: the iteration time\n");
-	     return -1;
-	}
-
-	nx = atoi(argv[1]);
-	if (nx<1)
-		return -1;
-	ny = atoi(argv[2]);
-	if (ny<1)
-		return -1;
-	nz = atoi(argv[3]);
-	if (nz<1)
-		return -1;
-	iteration = atoi(argv[4]);
-	if(iteration<1)
-		return -1;
-	
-	cl_int clStatus;
-	cl_platform_id clPlatform;
-	clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
-	CHECK_ERROR("clGetPlaformIDs")
-
-	cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-	
-	cl_device_id clDevice;
-	clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
-	CHECK_ERROR("clGetDeviceIDs")
-
-	cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
-
-	cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
-
-  	pb_SetOpenCL(&clContext, &clCommandQueue);
-
-	const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-	cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
-
-	char clOptions[50];
-	sprintf(clOptions,"-I src/opencl_nvidia");
-	clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-	CHECK_ERROR("clBuildProgram")
-
-	cl_kernel clKernel = clCreateKernel(clProgram,"block2D_hybrid_coarsen_x",&clStatus);
-	CHECK_ERROR("clCreateKernel") 			
-
-        // get local memory size [can be removed]
-        cl_ulong local_mem_size;
-        clGetDeviceInfo(clDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_mem_size, 0);
-        printf("Scratchpad Size = %lu\n", local_mem_size);
-        
-	//host data
-	float *h_A0;
-	float *h_Anext;
-	
-	//device
-	cl_mem d_A0;
-	cl_mem d_Anext;
-
-	//load data from files
-	size=nx*ny*nz;
-	
-	h_A0=(float*)malloc(sizeof(float)*size);
-	h_Anext=(float*)malloc(sizeof(float)*size);
-	pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  cl_int clStatus;
+  cl_platform_id clPlatform;
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
+  CHECK_ERROR("clGetPlaformIDs")
+
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+
+  cl_device_id clDevice;
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
+  CHECK_ERROR("clGetDeviceIDs")
+
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
+  CHECK_ERROR("clCreateContextFromType")
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+
+  const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions, "-I src/opencl_nvidia");
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel =
+      clCreateKernel(clProgram, "block2D_hybrid_coarsen_x", &clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  // get local memory size [can be removed]
+  cl_ulong local_mem_size;
+  clGetDeviceInfo(clDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong),
+                  &local_mem_size, 0);
+  printf("Scratchpad Size = %lu\n", local_mem_size);
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+
+  // device
+  cl_mem d_A0;
+  cl_mem d_Anext;
+
+  // load data from files
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
   FILE *fp = fopen(parameters->inpFiles[0], "rb");
-	read_data(h_A0, nx,ny,nz,fp);
+  read_data(h_A0, nx, ny, nz, fp);
   fclose(fp);
-  memcpy (h_Anext,h_A0,sizeof(float)*size);
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	
-	//memory allocation
-	d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")
-	d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
-	CHECK_ERROR("clCreateBuffer")	
-	
-	//memory copy
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-	//only use tx by ty threads
-	int tx = 32;
-	int ty = 4;
-	size_t block[3] = {tx,ty,1};
-	
-	//also change threads size maping from tx by ty to 2tx x ty
-	size_t grid[3] = {(nx+tx*2-1)/(tx*2)*tx,(ny+ty-1)/ty*ty,1};
-	
-	int sh_size = tx*2*ty*sizeof(float);
-        printf("Scratchpad Size Required = %d\n", sh_size);
-
-	clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
-	clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
-	clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-	clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-	clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx);
-	clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny);
-	clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz);
-	clStatus = clSetKernelArg(clKernel,7,sh_size,NULL);
-	CHECK_ERROR("clSetKernelArg")
-
-	//main execution
-	pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-	int t;
-	for(t=0;t<iteration;t++)
-	{
-		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-		CHECK_ERROR("clEnqueueNDRangeKernel")
-
-    cl_mem d_temp =  d_A0;
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
+  // memory allocation
+  d_A0 = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                        NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  d_Anext = clCreateBuffer(clContext, CL_MEM_READ_WRITE, size * sizeof(float),
+                           NULL, &clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  // memory copy
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_A0, CL_FALSE, 0,
+                                  size * sizeof(float), h_A0, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                  size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  // only use tx by ty threads
+  int tx = 32;
+  int ty = 4;
+  size_t block[3] = {tx, ty, 1};
+
+  // also change the thread-size mapping from tx by ty to 2tx x ty
+  size_t grid[3] = {(nx + tx * 2 - 1) / (tx * 2) * tx, (ny + ty - 1) / ty * ty,
+                    1};
+
+  int sh_size = tx * 2 * ty * sizeof(float);
+  printf("Scratchpad Size Required = %d\n", sh_size);
+
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(float), (void *)&c0);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(float), (void *)&c1);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), (void *)&nx);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ny);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nz);
+  clStatus = clSetKernelArg(clKernel, 7, sh_size, NULL);
+  CHECK_ERROR("clSetKernelArg")
+
+  // main execution
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+
+  int t;
+  for (t = 0; t < iteration; t++) {
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 3, NULL, grid,
+                                      block, 0, NULL, NULL);
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+
+    cl_mem d_temp = d_A0;
     d_A0 = d_Anext;
     d_Anext = d_temp;
-    clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0);
-    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
-
-	}
-
+    clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_A0);
+    clStatus = clSetKernelArg(clKernel, 3, sizeof(cl_mem), (void *)&d_Anext);
+  }
 
-  cl_mem d_temp =  d_A0;
+  cl_mem d_temp = d_A0;
   d_A0 = d_Anext;
   d_Anext = d_temp;
 
-	clStatus = clFinish(clCommandQueue);
-	CHECK_ERROR("clFinish")
-
-	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueReadBuffer")
-
-    	clStatus = clReleaseMemObject(d_A0);
-	clStatus = clReleaseMemObject(d_Anext);
-	clStatus = clReleaseKernel(clKernel);
-	clStatus = clReleaseProgram(clProgram);
-	clStatus = clReleaseCommandQueue(clCommandQueue);
-	clStatus = clReleaseContext(clContext);
-	CHECK_ERROR("clReleaseContext")
- 
-	if (parameters->outFile) {
-		pb_SwitchToTimer(&timers, pb_TimerID_IO);
-		outputData(parameters->outFile,h_Anext,nx,ny,nz);
-		
-	}
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-		
-	free((void*)clSource[0]);
-
-	free(h_A0);
-	free(h_Anext);
-	pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-
-	pb_PrintTimerSet(&timers);
-	pb_FreeParameters(parameters);
-
-	return 0;
+  clStatus = clFinish(clCommandQueue);
+  CHECK_ERROR("clFinish")
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_Anext, CL_TRUE, 0,
+                                 size * sizeof(float), h_Anext, 0, NULL, NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  clStatus = clReleaseMemObject(d_A0);
+  clStatus = clReleaseMemObject(d_Anext);
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+  CHECK_ERROR("clReleaseContext")
+
+  if (parameters->outFile) {
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  free((void *)clSource[0]);
+
+  free(h_A0);
+  free(h_Anext);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h b/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h
index 33bb06d5bd7e02e009565688882ed4e0ef2d52d4..1a682890b3619ef712c5e5e3a7313e325935ec6f 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/common.h
@@ -8,5 +8,5 @@
 
 #ifndef _COMMON_H_
 #define _COMMON_H_
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)))
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc b/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc
index 4c57469f7a4b1886f14be77a373750e1a7635cbe..95cd65c4a0e013c60c6edd43077346a7efdad1ae 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc
+++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/file.cc
@@ -7,81 +7,70 @@
  ***************************************************************************/
 
 #include <endian.h>
-#include <stdlib.h>
+#include <inttypes.h>
 #include <malloc.h>
 #include <stdio.h>
-#include <inttypes.h>
+#include <stdlib.h>
 
 #if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
+#error "File I/O is not implemented for this system: wrong endianness."
 #endif
 
-extern "C"
-void inputData(char* fName, int* nx, int* ny, int* nz)
-{
-  FILE* fid = fopen(fName, "r");
+extern "C" void inputData(char *fName, int *nx, int *ny, int *nz) {
+  FILE *fid = fopen(fName, "r");
+
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open input file\n");
+    exit(-1);
+  }
 
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-	
-  fread(nx, sizeof(int ),1,fid);
-  fread(ny, sizeof(int ),1,fid);
-  fread(nz, sizeof(int ),1,fid);
-  fclose (fid); 
+  fread(nx, sizeof(int), 1, fid);
+  fread(ny, sizeof(int), 1, fid);
+  fread(nz, sizeof(int), 1, fid);
+  fclose(fid);
 }
 
-extern "C"
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz)
-{
-  FILE* fid = fopen(fName, "w");
+extern "C" void outputData(char *fName, float *h_A0, int nx, int ny, int nz) {
+  FILE *fid = fopen(fName, "w");
   uint32_t tmp32;
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-  tmp32 = nx*ny*nz;
+  if (fid == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(-1);
+  }
+  tmp32 = nx * ny * nz;
   fwrite(&tmp32, sizeof(uint32_t), 1, fid);
   fwrite(h_A0, sizeof(float), tmp32, fid);
 
-  fclose (fid);
+  fclose(fid);
 }
 
-extern "C"
-char* readFile(const char* fileName)
-  {
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
+extern "C" char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error 1!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error 2!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error 3!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h b/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h
index 40c69734802ba06297418a3895d6eebd7af7b29b..b45c42371bbde3c3a39d88277adf39a8f537baab 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/file.h
@@ -12,9 +12,9 @@
 extern "C" {
 #endif
 
-void inputData(char* fName, int* nx, int* ny, int* nz);
-void outputData(char* fName, float *h_A0,int nx,int ny,int nz);
-char* readFile(const char* fileName);
+void inputData(char *fName, int *nx, int *ny, int *nz);
+void outputData(char *fName, float *h_A0, int nx, int ny, int nz);
+char *readFile(const char *fileName);
 
 #ifdef __cplusplus
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp b/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
index 9ecba96aed5642a4babaea8667576c25c1e4fb1f..5672a3ee490917d1374783eae5ab0ba1956ef441 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
+++ b/hpvm/test/parboil/benchmarks/stencil/src/visc/stencil.cpp
@@ -7,283 +7,276 @@
  *cr
  ***************************************************************************/
 
+#include "common.h"
+#include "file.h"
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 #include <visc.h>
-#include "file.h"
-#include "common.h"
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
-{
-    int s=0;
-    int i,j,k;
-    for(i=0; i<nz; i++)
-    {
-        for(j=0; j<ny; j++)
-        {
-            for(k=0; k<nx; k++)
-            {
-                fread(A0+s,sizeof(float),1,fp);
-                s++;
-            }
-        }
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
 typedef struct __attribute__((__packed__)) {
-    float c0, c1;
-    float* A0; size_t bytes_A0;
-    float* Anext; size_t bytes_Anext;
-    int nx, ny, nz;
-    size_t dim_X1, dim_Y1, dim_Z1;
-    size_t dim_X2, dim_Y2, dim_Z2;
+  float c0, c1;
+  float *A0;
+  size_t bytes_A0;
+  float *Anext;
+  size_t bytes_Anext;
+  int nx, ny, nz;
+  size_t dim_X1, dim_Y1, dim_Z1;
+  size_t dim_X2, dim_Y2, dim_Z2;
 } RootIn;
 
-void naive_kernel(float c0, float c1, 
-                  float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
-                  int nx, int ny, int nz) 
-{
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(2, A0, Anext, 1, Anext);
-
-    void* thisNode = __visc__getNode();
-    void* parentNode = __visc__getParentNode(thisNode);
-
-    int lx = __visc__getNodeInstanceID_x(thisNode);
-    int ly = __visc__getNodeInstanceID_y(thisNode);
-    int lz = __visc__getNodeInstanceID_z(thisNode);
-
-    int gx = __visc__getNodeInstanceID_x(parentNode);
-    int gy = __visc__getNodeInstanceID_y(parentNode);
-    int gz = __visc__getNodeInstanceID_z(parentNode);
-    
-    int gridx = __visc__getNumNodeInstances_x(thisNode);
-    int gridy = __visc__getNumNodeInstances_y(thisNode);
-    int gridz = __visc__getNumNodeInstances_z(thisNode);
-
-    int i = gx * gridx + lx + 1;
-    int j = gy * gridy + ly + 1;
-    int k = gz * gridz + lz + 1;
-    
-    if(i<nx-1)
-    {
-        Anext[Index3D (nx, ny, i, j, k)] = c1 *
-                                           ( A0[Index3D (nx, ny, i, j, k + 1)] +
-                                             A0[Index3D (nx, ny, i, j, k - 1)] +
-                                             A0[Index3D (nx, ny, i, j + 1, k)] +
-                                             A0[Index3D (nx, ny, i, j - 1, k)] +
-                                             A0[Index3D (nx, ny, i + 1, j, k)] +
-                                             A0[Index3D (nx, ny, i - 1, j, k)] )
-                                           - A0[Index3D (nx, ny, i, j, k)] * c0;
-    }
+void naive_kernel(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
+                  size_t bytes_Anext, int nx, int ny, int nz) {
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(2, A0, Anext, 1, Anext);
+
+  void *thisNode = __visc__getNode();
+  void *parentNode = __visc__getParentNode(thisNode);
+
+  int lx = __visc__getNodeInstanceID_x(thisNode);
+  int ly = __visc__getNodeInstanceID_y(thisNode);
+  int lz = __visc__getNodeInstanceID_z(thisNode);
+
+  int gx = __visc__getNodeInstanceID_x(parentNode);
+  int gy = __visc__getNodeInstanceID_y(parentNode);
+  int gz = __visc__getNodeInstanceID_z(parentNode);
+
+  int gridx = __visc__getNumNodeInstances_x(thisNode);
+  int gridy = __visc__getNumNodeInstances_y(thisNode);
+  int gridz = __visc__getNumNodeInstances_z(thisNode);
+
+  int i = gx * gridx + lx + 1;
+  int j = gy * gridy + ly + 1;
+  int k = gz * gridz + lz + 1;
+
+  if (i < nx - 1) {
+    Anext[Index3D(nx, ny, i, j, k)] = c1 * (A0[Index3D(nx, ny, i, j, k + 1)] +
+                                            A0[Index3D(nx, ny, i, j, k - 1)] +
+                                            A0[Index3D(nx, ny, i, j + 1, k)] +
+                                            A0[Index3D(nx, ny, i, j - 1, k)] +
+                                            A0[Index3D(nx, ny, i + 1, j, k)] +
+                                            A0[Index3D(nx, ny, i - 1, j, k)]) -
+                                      A0[Index3D(nx, ny, i, j, k)] * c0;
+  }
 }
 
-void stencilLvl1(float c0, float c1, 
-                 float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
-                 int nx, int ny, int nz,
-                 size_t dim_X1, size_t dim_Y1, size_t dim_Z1) 
-{
-    __visc__hint(visc::DEVICE);
-    __visc__attributes(2, A0, Anext, 1, Anext);
-    void* stencil_node = __visc__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1);
-    __visc__bindIn(stencil_node, 0, 0, 0);
-    __visc__bindIn(stencil_node, 1, 1, 0);
-    __visc__bindIn(stencil_node, 2, 2, 0);
-    __visc__bindIn(stencil_node, 3, 3, 0);
-    __visc__bindIn(stencil_node, 4, 4, 0);
-    __visc__bindIn(stencil_node, 5, 5, 0);
-    __visc__bindIn(stencil_node, 6, 6, 0);
-    __visc__bindIn(stencil_node, 7, 7, 0);
-    __visc__bindIn(stencil_node, 8, 8, 0);
+void stencilLvl1(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
+                 size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1,
+                 size_t dim_Y1, size_t dim_Z1) {
+  __visc__hint(visc::DEVICE);
+  __visc__attributes(2, A0, Anext, 1, Anext);
+  void *stencil_node =
+      __visc__createNodeND(3, naive_kernel, dim_X1, dim_Y1, dim_Z1);
+  __visc__bindIn(stencil_node, 0, 0, 0);
+  __visc__bindIn(stencil_node, 1, 1, 0);
+  __visc__bindIn(stencil_node, 2, 2, 0);
+  __visc__bindIn(stencil_node, 3, 3, 0);
+  __visc__bindIn(stencil_node, 4, 4, 0);
+  __visc__bindIn(stencil_node, 5, 5, 0);
+  __visc__bindIn(stencil_node, 6, 6, 0);
+  __visc__bindIn(stencil_node, 7, 7, 0);
+  __visc__bindIn(stencil_node, 8, 8, 0);
 }
 
-void stencilLvl2(float c0, float c1, 
-                 float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
-                 int nx, int ny, int nz,
-                 size_t dim_X1, size_t dim_Y1, size_t dim_Z1,
-                 size_t dim_X2, size_t dim_Y2, size_t dim_Z2)
-{
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(2, A0, Anext, 1, Anext);
-    void* stencil_node = __visc__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2);
-    __visc__bindIn(stencil_node, 0, 0, 0);
-    __visc__bindIn(stencil_node, 1, 1, 0);
-    __visc__bindIn(stencil_node, 2, 2, 0);
-    __visc__bindIn(stencil_node, 3, 3, 0);
-    __visc__bindIn(stencil_node, 4, 4, 0);
-    __visc__bindIn(stencil_node, 5, 5, 0);
-    __visc__bindIn(stencil_node, 6, 6, 0);
-    __visc__bindIn(stencil_node, 7, 7, 0);
-    __visc__bindIn(stencil_node, 8, 8, 0);
-    __visc__bindIn(stencil_node, 9, 9, 0);
-    __visc__bindIn(stencil_node, 10, 10, 0);
-    __visc__bindIn(stencil_node, 11, 11, 0);
+void stencilLvl2(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
+                 size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1,
+                 size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2,
+                 size_t dim_Z2) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, A0, Anext, 1, Anext);
+  void *stencil_node =
+      __visc__createNodeND(3, stencilLvl1, dim_X2, dim_Y2, dim_Z2);
+  __visc__bindIn(stencil_node, 0, 0, 0);
+  __visc__bindIn(stencil_node, 1, 1, 0);
+  __visc__bindIn(stencil_node, 2, 2, 0);
+  __visc__bindIn(stencil_node, 3, 3, 0);
+  __visc__bindIn(stencil_node, 4, 4, 0);
+  __visc__bindIn(stencil_node, 5, 5, 0);
+  __visc__bindIn(stencil_node, 6, 6, 0);
+  __visc__bindIn(stencil_node, 7, 7, 0);
+  __visc__bindIn(stencil_node, 8, 8, 0);
+  __visc__bindIn(stencil_node, 9, 9, 0);
+  __visc__bindIn(stencil_node, 10, 10, 0);
+  __visc__bindIn(stencil_node, 11, 11, 0);
 }
 
-void stencilLvl3(float c0, float c1, 
-                 float* A0, size_t bytes_A0, float* Anext, size_t bytes_Anext, 
-                 int nx, int ny, int nz,
-                 size_t dim_X1, size_t dim_Y1, size_t dim_Z1,
-                 size_t dim_X2, size_t dim_Y2, size_t dim_Z2)
-{
-    __visc__hint(visc::CPU_TARGET);
-    __visc__attributes(2, A0, Anext, 1, Anext);
-    void* stencil_node = __visc__createNodeND(0, stencilLvl2);
-    __visc__bindIn(stencil_node, 0, 0, 0);
-    __visc__bindIn(stencil_node, 1, 1, 0);
-    __visc__bindIn(stencil_node, 2, 2, 0);
-    __visc__bindIn(stencil_node, 3, 3, 0);
-    __visc__bindIn(stencil_node, 4, 4, 0);
-    __visc__bindIn(stencil_node, 5, 5, 0);
-    __visc__bindIn(stencil_node, 6, 6, 0);
-    __visc__bindIn(stencil_node, 7, 7, 0);
-    __visc__bindIn(stencil_node, 8, 8, 0);
-    __visc__bindIn(stencil_node, 9, 9, 0);
-    __visc__bindIn(stencil_node, 10, 10, 0);
-    __visc__bindIn(stencil_node, 11, 11, 0);
-    __visc__bindIn(stencil_node, 12, 12, 0);
-    __visc__bindIn(stencil_node, 13, 13, 0);
-    __visc__bindIn(stencil_node, 14, 14, 0);
+void stencilLvl3(float c0, float c1, float *A0, size_t bytes_A0, float *Anext,
+                 size_t bytes_Anext, int nx, int ny, int nz, size_t dim_X1,
+                 size_t dim_Y1, size_t dim_Z1, size_t dim_X2, size_t dim_Y2,
+                 size_t dim_Z2) {
+  __visc__hint(visc::CPU_TARGET);
+  __visc__attributes(2, A0, Anext, 1, Anext);
+  void *stencil_node = __visc__createNodeND(0, stencilLvl2);
+  __visc__bindIn(stencil_node, 0, 0, 0);
+  __visc__bindIn(stencil_node, 1, 1, 0);
+  __visc__bindIn(stencil_node, 2, 2, 0);
+  __visc__bindIn(stencil_node, 3, 3, 0);
+  __visc__bindIn(stencil_node, 4, 4, 0);
+  __visc__bindIn(stencil_node, 5, 5, 0);
+  __visc__bindIn(stencil_node, 6, 6, 0);
+  __visc__bindIn(stencil_node, 7, 7, 0);
+  __visc__bindIn(stencil_node, 8, 8, 0);
+  __visc__bindIn(stencil_node, 9, 9, 0);
+  __visc__bindIn(stencil_node, 10, 10, 0);
+  __visc__bindIn(stencil_node, 11, 11, 0);
+  __visc__bindIn(stencil_node, 12, 12, 0);
+  __visc__bindIn(stencil_node, 13, 13, 0);
+  __visc__bindIn(stencil_node, 14, 14, 0);
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated 7 points stencil codes****\n");
-    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-
-    //declaration
-    int nx,ny,nz;
-    size_t size;
-    int iteration;
-    float c0=1.0/6.0;
-    float c1=1.0/6.0/6.0;
-
-    if (argc<5)
-    {
-        printf("Usage: probe nx ny nz t\n"
-               "nx: the grid size x\n"
-               "ny: the grid size y\n"
-               "nz: the grid size z\n"
-               "t: the iteration time\n");
-        return -1;
-    }
-
-    nx = atoi(argv[1]);
-    if (nx<1)
-        return -1;
-    ny = atoi(argv[2]);
-    if (ny<1)
-        return -1;
-    nz = atoi(argv[3]);
-    if (nz<1)
-        return -1;
-    iteration = atoi(argv[4]);
-    if(iteration<1)
-        return -1;
-
-    //host data
-    float *h_A0;
-    float *h_Anext;
-
-    //load data from files
-
-    size=nx*ny*nz;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-
-
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0, nx,ny,nz,fp);
-    fclose(fp);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-    llvm_visc_track_mem(h_A0, sizeof(float)*size);
-    llvm_visc_track_mem(h_Anext, sizeof(float)*size);
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+
+  // declaration
+  int nx, ny, nz;
+  size_t size;
+  int iteration;
+  float c0 = 1.0 / 6.0;
+  float c1 = 1.0 / 6.0 / 6.0;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
+
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+
+  // load data from files
+
+  size = nx * ny * nz;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  read_data(h_A0, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
+
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
+  llvm_visc_track_mem(h_A0, sizeof(float) * size);
+  llvm_visc_track_mem(h_Anext, sizeof(float) * size);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  // only use 1D thread block
+  size_t tx = 256;
+  size_t block[3] = {tx, 1, 1};
+  size_t grid[3] = {((unsigned)nx - 2 + tx - 1) / tx * tx, (unsigned)ny - 2,
+                    (unsigned)nz - 2};
+  // size_t grid[3] = {nx-2,ny-2,nz-2};
+  size_t offset[3] = {1, 1, 1};
+
+  printf("grid(%ld, %ld, %ld), block(%ld, %ld, %ld)\n", grid[0], grid[1],
+         grid[2], block[0], block[1], block[2]);
+  // main execution
+
+  int t;
+  size_t bytes = size * sizeof(float);
+  printf("A[126,1,1] = %f\n", h_A0[Index3D(nx, ny, 126, 1, 1)]);
+  printf("A[125,1,1] = %f\n", h_A0[Index3D(nx, ny, 125, 1, 1)]);
+  for (t = 0; t < iteration; t++) {
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
 
+    void *root_in = malloc(sizeof(RootIn));
+    RootIn root_in_local = {c0,
+                            c1,
+                            h_A0,
+                            bytes,
+                            h_Anext,
+                            bytes,
+                            nx,
+                            ny,
+                            nz,
+                            block[0],
+                            block[1],
+                            block[2],
+                            grid[0] / block[0],
+                            grid[1] / block[1],
+                            grid[2] / block[2]};
+    *(RootIn *)root_in = root_in_local;
+    void *stencilDFG = __visc__launch(0, stencilLvl3, root_in);
+
+    __visc__wait(stencilDFG);
+    // printf("iteration %d\n",t);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-
-
-    //only use 1D thread block
-    size_t tx = 256;
-    size_t block[3] = {tx,1,1};
-    size_t grid[3] = {((unsigned)nx-2+tx-1)/tx*tx,(unsigned)ny-2,(unsigned)nz-2};
-    //size_t grid[3] = {nx-2,ny-2,nz-2};
-    size_t offset[3] = {1,1,1};
-
-    printf("grid(%ld, %ld, %ld), block(%ld, %ld, %ld)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]);
-    //main execution
-
-    int t;
-    size_t bytes = size*sizeof(float);
-    printf("A[126,1,1] = %f\n", h_A0[Index3D(nx, ny, 126,1,1)]);
-    printf("A[125,1,1] = %f\n", h_A0[Index3D(nx, ny, 125,1,1)]);
-    for(t=0; t<iteration; t++)
-    {
-        pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        
-        void* root_in = malloc(sizeof(RootIn));
-        RootIn root_in_local = {
-            c0, c1,
-            h_A0, bytes,
-            h_Anext, bytes,
-            nx, ny, nz,
-            block[0], block[1], block[2],
-            grid[0]/block[0], grid[1]/block[1], grid[2]/block[2]
-        };
-        *(RootIn*)root_in = root_in_local;
-        void* stencilDFG = __visc__launch(0, stencilLvl3, root_in);
-
-        __visc__wait(stencilDFG);
-        //printf("iteration %d\n",t);
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-        float* h_temp = h_A0;
-        h_A0 = h_Anext;
-        h_Anext = h_temp;
-    }
-
-
-    float* h_temp = h_A0;
+    float *h_temp = h_A0;
     h_A0 = h_Anext;
     h_Anext = h_temp;
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    llvm_visc_request_mem(h_Anext, bytes);
-    printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126,1,1)]);
-    printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125,1,1)]);
-
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  }
 
-    llvm_visc_untrack_mem(h_A0);
-    llvm_visc_untrack_mem(h_Anext);
+  float *h_temp = h_A0;
+  h_A0 = h_Anext;
+  h_Anext = h_temp;
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_visc_request_mem(h_Anext, bytes);
+  printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]);
+  printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
 
-    __visc__cleanup();
+  llvm_visc_untrack_mem(h_A0);
+  llvm_visc_untrack_mem(h_Anext);
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Anext,nx,ny,nz);
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
 
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    free(h_A0);
-    free(h_Anext);
-    pb_FreeParameters(parameters);
+  __visc__cleanup();
 
-    return 0;
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Anext, nx, ny, nz);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h
index 042bd64a23d897959a4145e6d2b42df76053e74c..12a6d131c29067073fa79f09c4e6f91b8662969c 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h
+++ b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/common.h
@@ -10,6 +10,6 @@
 #define _COMMON_H_
 //#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k)))
 // +3 for padding
-#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))+3)
+#define Index3D(_nx, _ny, _i, _j, _k) ((_i) + _nx * ((_j) + _ny * (_k)) + 3)
 #define TCF 4
 #endif
diff --git a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c
index 28fb87ac479a9270fbda08f958017fbf495130c1..bb6e45c932a68d951f5559bd856017ecf71aade6 100644
--- a/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c
+++ b/hpvm/test/parboil/benchmarks/stencil/src/visc_vec/stencil.c
@@ -7,179 +7,170 @@
  *cr
  ***************************************************************************/
 
+#include "common.h"
+#include "file.h"
+#include <parboil.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <parboil.h>
 #include <visc.h>
-#include "file.h"
-#include "common.h"
 
-static int read_data(float *A0, int nx,int ny,int nz,FILE *fp)
-{
-    int s=0;
-    int i,j,k;
-    for(i=0; i<nz; i++)
-    {
-        for(j=0; j<ny; j++)
-        {
-            for(k=0; k<nx; k++)
-            {
-                fread(A0+s,sizeof(float),1,fp);
-                s++;
-            }
-        }
+static int read_data(float *A0, int nx, int ny, int nz, FILE *fp) {
+  int s = 0;
+  int i, j, k;
+  for (i = 0; i < nz; i++) {
+    for (j = 0; j < ny; j++) {
+      for (k = 0; k < nx; k++) {
+        fread(A0 + s, sizeof(float), 1, fp);
+        s++;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
-void naive_kernel(float c0,float c1, float* A0, float *Anext,int nx,int ny,int nz)
-{
-    __visc__attributes(2, A0, Anext, 1, Anext);
-    int i = get_global_id(0)+1;
-    int j = get_global_id(1)+1;
-    int k = get_global_id(2)+1;
-
-    if(i<nx-1)
-    {
-        Anext[Index3D (nx, ny, i, j, k)] = c1 *
-                                           ( A0[Index3D (nx, ny, i, j, k + 1)] +
-                                             A0[Index3D (nx, ny, i, j, k - 1)] +
-                                             A0[Index3D (nx, ny, i, j + 1, k)] +
-                                             A0[Index3D (nx, ny, i, j - 1, k)] +
-                                             A0[Index3D (nx, ny, i + 1, j, k)] +
-                                             A0[Index3D (nx, ny, i - 1, j, k)] )
-                                           - A0[Index3D (nx, ny, i, j, k)] * c0;
-    }
+void naive_kernel(float c0, float c1, float *A0, float *Anext, int nx, int ny,
+                  int nz) {
+  __visc__attributes(2, A0, Anext, 1, Anext);
+  int i = get_global_id(0) + 1;
+  int j = get_global_id(1) + 1;
+  int k = get_global_id(2) + 1;
+
+  if (i < nx - 1) {
+    Anext[Index3D(nx, ny, i, j, k)] = c1 * (A0[Index3D(nx, ny, i, j, k + 1)] +
+                                            A0[Index3D(nx, ny, i, j, k - 1)] +
+                                            A0[Index3D(nx, ny, i, j + 1, k)] +
+                                            A0[Index3D(nx, ny, i, j - 1, k)] +
+                                            A0[Index3D(nx, ny, i + 1, j, k)] +
+                                            A0[Index3D(nx, ny, i - 1, j, k)]) -
+                                      A0[Index3D(nx, ny, i, j, k)] * c0;
+  }
 }
 
-int main(int argc, char** argv) {
-    struct pb_TimerSet timers;
-    struct pb_Parameters *parameters;
-
-    printf("OpenCL accelerated 7 points stencil codes****\n");
-    printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
-    parameters = pb_ReadParameters(&argc, argv);
-
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-
-    //declaration
-    int nx,ny,nz;
-    size_t size;
-    int iteration;
-    float c0=1.0/6.0;
-    float c1=1.0/6.0/6.0;
-
-    if (argc<5)
-    {
-        printf("Usage: probe nx ny nz t\n"
-               "nx: the grid size x\n"
-               "ny: the grid size y\n"
-               "nz: the grid size z\n"
-               "t: the iteration time\n");
-        return -1;
-    }
-
-    nx = atoi(argv[1]);
-    if (nx<1)
-        return -1;
-    ny = atoi(argv[2]);
-    if (ny<1)
-        return -1;
-    nz = atoi(argv[3]);
-    if (nz<1)
-        return -1;
-    iteration = atoi(argv[4]);
-    if(iteration<1)
-        return -1;
-
-    //host data
-    float *h_A0;
-    float *h_Anext;
-
-    //load data from files
-
-    size=nx*ny*nz;
-
-    // Padding in the beginning to get aligned loads and stores
-    size = size+3;
-
-    h_A0=(float*)malloc(sizeof(float)*size);
-    h_Anext=(float*)malloc(sizeof(float)*size);
-
-
-    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-    FILE *fp = fopen(parameters->inpFiles[0], "rb");
-    read_data(h_A0+3, nx,ny,nz,fp);
-    fclose(fp);
-
-    pb_InitializeTimerSet(&timers);
-    __visc__init();
-
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
-    llvm_visc_track_mem(h_A0, sizeof(float)*size);
-    llvm_visc_track_mem(h_Anext, sizeof(float)*size);
-
+int main(int argc, char **argv) {
+  struct pb_TimerSet timers;
+  struct pb_Parameters *parameters;
+
+  printf("OpenCL accelerated 7 points stencil codes****\n");
+  printf("Author: Li-Wen Chang <lchang20@illinois.edu>\n");
+  parameters = pb_ReadParameters(&argc, argv);
+
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+
+  // declaration
+  int nx, ny, nz;
+  size_t size;
+  int iteration;
+  float c0 = 1.0 / 6.0;
+  float c1 = 1.0 / 6.0 / 6.0;
+
+  if (argc < 5) {
+    printf("Usage: probe nx ny nz t\n"
+           "nx: the grid size x\n"
+           "ny: the grid size y\n"
+           "nz: the grid size z\n"
+           "t: the iteration time\n");
+    return -1;
+  }
+
+  nx = atoi(argv[1]);
+  if (nx < 1)
+    return -1;
+  ny = atoi(argv[2]);
+  if (ny < 1)
+    return -1;
+  nz = atoi(argv[3]);
+  if (nz < 1)
+    return -1;
+  iteration = atoi(argv[4]);
+  if (iteration < 1)
+    return -1;
+
+  // host data
+  float *h_A0;
+  float *h_Anext;
+
+  // load data from files
+
+  size = nx * ny * nz;
+
+  // Padding in the beginning to get aligned loads and stores
+  size = size + 3;
+
+  h_A0 = (float *)malloc(sizeof(float) * size);
+  h_Anext = (float *)malloc(sizeof(float) * size);
+
+  /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+  FILE *fp = fopen(parameters->inpFiles[0], "rb");
+  read_data(h_A0 + 3, nx, ny, nz, fp);
+  fclose(fp);
+
+  pb_InitializeTimerSet(&timers);
+  __visc__init();
+
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_TRACK);
+  llvm_visc_track_mem(h_A0, sizeof(float) * size);
+  llvm_visc_track_mem(h_Anext, sizeof(float) * size);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  memcpy(h_Anext, h_A0, sizeof(float) * size);
+
+  // only use 1D thread block
+  int tx = 256 / TCF;
+  int block[3] = {tx, 1, 1};
+  int grid[3] = {(nx - 2 + TCF * tx - 1) / (TCF * tx) * tx, ny - 2, nz - 2};
+  // size_t grid[3] = {nx-2,ny-2,nz-2};
+  size_t offset[3] = {1, 1, 1};
+
+  printf("grid(%d, %d, %d), block(%d, %d, %d)\n", grid[0], grid[1], grid[2],
+         block[0], block[1], block[2]);
+  // main execution
+
+  int t;
+  size_t bytes = size * sizeof(float);
+  printf("A[126,1,1] = %f\n", h_A0[Index3D(nx, ny, 126, 1, 1)]);
+  printf("A[125,1,1] = %f\n", h_A0[Index3D(nx, ny, 125, 1, 1)]);
+  for (t = 0; t < iteration; t++) {
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+    unsigned stencilDFG = __visc__node(
+        naive_kernel, 2, 3, block[0], block[1], block[2], grid[0] / block[0],
+        grid[1] / block[1], grid[2] / block[2], 9, (float)c0, (float)c1, h_A0,
+        bytes, h_Anext, bytes, nx, ny, nz, 0);
+    __visc__wait(stencilDFG);
+    // printf("iteration %d\n",t);
     pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-    memcpy (h_Anext,h_A0,sizeof(float)*size);
-
-
-    //only use 1D thread block
-    int tx =256/TCF;
-    int block[3] = {tx,1,1};
-    int grid[3] = {(nx-2+TCF*tx-1)/(TCF*tx)*tx,ny-2,nz-2};
-    //size_t grid[3] = {nx-2,ny-2,nz-2};
-    size_t offset[3] = {1,1,1};
-
-    printf("grid(%d, %d, %d), block(%d, %d, %d)\n", grid[0], grid[1], grid[2], block[0], block[1], block[2]);
-    //main execution
-
-    int t;
-    size_t bytes = size*sizeof(float);
-    printf("A[126,1,1] = %f\n", h_A0[Index3D(nx, ny, 126,1,1)]);
-    printf("A[125,1,1] = %f\n", h_A0[Index3D(nx, ny, 125,1,1)]);
-    for(t=0; t<iteration; t++)
-    {
-        pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        unsigned stencilDFG = __visc__node(naive_kernel, 2, 3, block[0], block[1], block[2], grid[0]/block[0], grid[1]/block[1], grid[2]/block[2], 9, (float)c0, (float)c1, h_A0, bytes, h_Anext, bytes, nx, ny, nz, 0);
-        __visc__wait(stencilDFG);
-        //printf("iteration %d\n",t);
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-        float* h_temp = h_A0;
-        h_A0 = h_Anext;
-        h_Anext = h_temp;
-
-    }
-
-
-    float* h_temp = h_A0;
+    float *h_temp = h_A0;
     h_A0 = h_Anext;
     h_Anext = h_temp;
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    llvm_visc_request_mem(h_Anext, bytes);
-    printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126,1,1)]);
-    printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125,1,1)]);
+  }
 
-    pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
+  float *h_temp = h_A0;
+  h_A0 = h_Anext;
+  h_Anext = h_temp;
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  llvm_visc_request_mem(h_Anext, bytes);
+  printf("A[126,1,1] = %f\n", h_Anext[Index3D(nx, ny, 126, 1, 1)]);
+  printf("A[125,1,1] = %f\n", h_Anext[Index3D(nx, ny, 125, 1, 1)]);
 
-    llvm_visc_untrack_mem(h_A0);
-    llvm_visc_untrack_mem(h_Anext);
+  pb_SwitchToTimer(&timers, visc_TimerID_MEM_UNTRACK);
 
-    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-    pb_PrintTimerSet(&timers);
+  llvm_visc_untrack_mem(h_A0);
+  llvm_visc_untrack_mem(h_Anext);
 
-    __visc__cleanup();
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
 
-    if (parameters->outFile) {
-        /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
-        outputData(parameters->outFile,h_Anext+3,nx,ny,nz);
+  __visc__cleanup();
 
-    }
-    /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
-    free(h_A0);
-    free(h_Anext);
-    pb_FreeParameters(parameters);
-
-    return 0;
+  if (parameters->outFile) {
+    /*pb_SwitchToTimer(&timers, pb_TimerID_IO);*/
+    outputData(parameters->outFile, h_Anext + 3, nx, ny, nz);
+  }
+  /*pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);*/
+  free(h_A0);
+  free(h_Anext);
+  pb_FreeParameters(parameters);
+
+  return 0;
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/args.c b/hpvm/test/parboil/benchmarks/tpacf/src/base/args.c
index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/base/args.c
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/args.c
@@ -5,22 +5,21 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include "args.h"
+#include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-#include <stdio.h>
-#include "args.h"
 
 extern char *optarg;
 
-void usage(char *name)
-{
+void usage(char *name) {
   printf("Usage: %s <-d data_file_name> <-r rnd_file_name> "
-	 "<-m rnd_count> <-p count> <-o file_name>\n", name);
+         "<-n rnd_count> <-p count> <-o file_name>\n",
+         name);
   exit(0);
 }
 
-void parse_args(int argc, char **argv, options* args)
-{
+void parse_args(int argc, char **argv, options *args) {
   int c;
 
   args->data_name = NULL;
@@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args)
   args->random_count = 0;
   args->npoints = 0;
   args->output_name = NULL;
-  
-  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF)
-    {
-      switch (c)
-	{
-        case 'd':
-          args->data_name = optarg;
-          break;
-        case 'r':
-          args->random_name = optarg;
-          break;
-        case 'n':
-          args->random_count = atoi(optarg);
-          break;
-        case 'o':
-          args->output_name = optarg;
-          break;
-        case 'p':
-          args->npoints = atol(optarg);
-          break;
-        default:
-          usage(argv[0]);
-	}
+
+  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) {
+    switch (c) {
+    case 'd':
+      args->data_name = optarg;
+      break;
+    case 'r':
+      args->random_name = optarg;
+      break;
+    case 'n':
+      args->random_count = atoi(optarg);
+      break;
+    case 'o':
+      args->output_name = optarg;
+      break;
+    case 'p':
+      args->npoints = atol(optarg);
+      break;
+    default:
+      usage(argv[0]);
     }
+  }
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/base/args.h
index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/base/args.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/args.h
@@ -8,8 +8,7 @@
 #ifndef __ARGS_H__
 #define __ARGS_H__
 
-typedef struct _options_
-{
+typedef struct _options_ {
   char *data_name;
   char *random_name;
   int random_count;
@@ -18,6 +17,6 @@ typedef struct _options_
 } options;
 
 void usage(char *name);
-void parse_args(int argc, char **argv, options* args);
+void parse_args(int argc, char **argv, options *args);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/main.c b/hpvm/test/parboil/benchmarks/tpacf/src/base/main.c
index 2bafdf4580c5a6f4402cf40991c93bffcf8ce3ee..da9b51a7202148e43f9b5bf51156b0d651473571 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/base/main.c
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/main.c
@@ -5,18 +5,17 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
-#include <unistd.h>
 #include <sys/time.h>
-#include <math.h>
+#include <unistd.h>
 
 #include "args.h"
 #include "model.h"
 
-int main( int argc, char **argv )
-{
+int main(int argc, char **argv) {
   struct pb_TimerSet timers;
   struct pb_Parameters *params;
   int rf, k, nbins, npd, npr;
@@ -26,77 +25,65 @@ int main( int argc, char **argv )
   struct cartesian *data, *random;
   FILE *outfile;
 
-  pb_InitializeTimerSet( &timers );
-  params = pb_ReadParameters( &argc, argv );
+  pb_InitializeTimerSet(&timers);
+  params = pb_ReadParameters(&argc, argv);
 
   options args;
-  parse_args( argc, argv, &args );
-    
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  nbins = (int)floor(bins_per_dec * (log10(max_arcmin) - 
-					 log10(min_arcmin)));
-  memsize = (nbins+2)*sizeof(long long);
-    
+  parse_args(argc, argv, &args);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  nbins = (int)floor(bins_per_dec * (log10(max_arcmin) - log10(min_arcmin)));
+  memsize = (nbins + 2) * sizeof(long long);
+
   // memory for bin boundaries
-  binb = (float *)malloc((nbins+1)*sizeof(float));
-  if (binb == NULL)
-    {
-      fprintf(stderr, "Unable to allocate memory\n");
-      exit(-1);
-    }
-  for (k = 0; k < nbins+1; k++)
-    {
-      binb[k] = cos(pow(10, log10(min_arcmin) + 
-			k*1.0/bins_per_dec) / 60.0*D2R);
-    }
-    
+  binb = (float *)malloc((nbins + 1) * sizeof(float));
+  if (binb == NULL) {
+    fprintf(stderr, "Unable to allocate memory\n");
+    exit(-1);
+  }
+  for (k = 0; k < nbins + 1; k++) {
+    binb[k] =
+        cos(pow(10, log10(min_arcmin) + k * 1.0 / bins_per_dec) / 60.0 * D2R);
+  }
+
   // memory for DD
-  DD = (long long*)malloc(memsize);
-  if (DD == NULL)
-    {
-      fprintf(stderr, "Unable to allocate memory\n");
-      exit(-1);
-    }
+  DD = (long long *)malloc(memsize);
+  if (DD == NULL) {
+    fprintf(stderr, "Unable to allocate memory\n");
+    exit(-1);
+  }
   bzero(DD, memsize);
-    
+
   // memory for RR
-  RRS = (long long*)malloc(memsize);
-  if (RRS == NULL)
-    {
-      fprintf(stderr, "Unable to allocate memory\n");
-      exit(-1);
-    }
+  RRS = (long long *)malloc(memsize);
+  if (RRS == NULL) {
+    fprintf(stderr, "Unable to allocate memory\n");
+    exit(-1);
+  }
   bzero(RRS, memsize);
-    
+
   // memory for DR
-  DRS = (long long*)malloc(memsize);
-  if (DRS == NULL)
-    {
-      fprintf(stderr, "Unable to allocate memory\n");
-      exit(-1);
-    }
+  DRS = (long long *)malloc(memsize);
+  if (DRS == NULL) {
+    fprintf(stderr, "Unable to allocate memory\n");
+    exit(-1);
+  }
   bzero(DRS, memsize);
-    
+
   // memory for input data
-  data = (struct cartesian*)malloc
-    (args.npoints* sizeof(struct cartesian));
-  if (data == NULL)
-    {
-      fprintf(stderr, 
-	      "Unable to allocate memory for % data points (#1)\n", 
-	      args.npoints);
-      return(0);
-    }
-    
-  random = (struct cartesian*)malloc
-    (args.npoints*sizeof(struct cartesian));
-  if (random == NULL)
-    {
-      fprintf(stderr, 
-	      "Unable to allocate memory for % data points (#2)\n", 
-	      args.npoints);
-      return(0);
-    }
+  data = (struct cartesian *)malloc(args.npoints * sizeof(struct cartesian));
+  if (data == NULL) {
+    fprintf(stderr, "Unable to allocate memory for %d data points (#1)\n",
+            args.npoints);
+    return (0);
+  }
+
+  random = (struct cartesian *)malloc(args.npoints * sizeof(struct cartesian));
+  if (random == NULL) {
+    fprintf(stderr, "Unable to allocate memory for %d data points (#2)\n",
+            args.npoints);
+    return (0);
+  }
 
   printf("Min distance: %f arcmin\n", min_arcmin);
   printf("Max distance: %f arcmin\n", max_arcmin);
@@ -104,58 +91,51 @@ int main( int argc, char **argv )
   printf("Total bins  : %i\n", nbins);
 
   // read data file
-  pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
   npd = readdatafile(params->inpFiles[0], data, args.npoints);
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  if (npd != args.npoints)
-    {
-      fprintf(stderr, 
-	      "Error: read %i data points out of %i\n", 
-	      npd, args.npoints);
-      return(0);
-    }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  if (npd != args.npoints) {
+    fprintf(stderr, "Error: read %i data points out of %i\n", npd,
+            args.npoints);
+    return (0);
+  }
 
   // compute DD
   doCompute(data, npd, NULL, 0, 1, DD, nbins, binb);
 
   // loop through random data files
-  for (rf = 0; rf < args.random_count; rf++)
-    {
-      // read random file
-      pb_SwitchToTimer( &timers, pb_TimerID_IO );
-      npr = readdatafile(params->inpFiles[rf+1], random, args.npoints);
-      pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-      if (npr != args.npoints)
-        {
-	  fprintf(stderr, 
-		  "Error: read %i random points out of %i in file %s\n", 
-		  npr, args.npoints, params->inpFiles[rf+1]);
-	  return(0);
-        }
-
-      // compute RR
-      doCompute(random, npr, NULL, 0, 1, RRS, nbins, binb);
-
-      // compute DR
-      doCompute(data, npd, random, npr, 0, DRS, nbins, binb);
+  for (rf = 0; rf < args.random_count; rf++) {
+    // read random file
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    npr = readdatafile(params->inpFiles[rf + 1], random, args.npoints);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    if (npr != args.npoints) {
+      fprintf(stderr, "Error: read %i random points out of %i in file %s\n",
+              npr, args.npoints, params->inpFiles[rf + 1]);
+      return (0);
     }
 
+    // compute RR
+    doCompute(random, npr, NULL, 0, 1, RRS, nbins, binb);
+
+    // compute DR
+    doCompute(data, npd, random, npr, 0, DRS, nbins, binb);
+  }
+
   // compute and output results
-  if ((outfile = fopen(params->outFile, "w")) == NULL)
-    {
-      fprintf(stderr, 
-	      "Unable to open output file %s for writing, assuming stdout\n", 
-	      params->outFile);
-      outfile = stdout;
-    }
+  if ((outfile = fopen(params->outFile, "w")) == NULL) {
+    fprintf(stderr,
+            "Unable to open output file %s for writing, assuming stdout\n",
+            params->outFile);
+    outfile = stdout;
+  }
 
-  pb_SwitchToTimer( &timers, pb_TimerID_IO );
-  for (k = 1; k < nbins+1; k++)
-    {
-      fprintf(outfile, "%d\n%d\n%d\n", DD[k], DRS[k], RRS[k]);      
-    }
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  for (k = 1; k < nbins + 1; k++) {
+    fprintf(outfile, "%lld\n%lld\n%lld\n", DD[k], DRS[k], RRS[k]);
+  }
 
-  if(outfile != stdout)
+  if (outfile != stdout)
     fclose(outfile);
 
   // free memory
@@ -165,9 +145,8 @@ int main( int argc, char **argv )
   free(DD);
   free(RRS);
   free(DRS);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_NONE );
-  pb_PrintTimerSet( &timers );
-  pb_FreeParameters( params );
-}
 
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(params);
+}
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/base/model.h
index f2b182ba412bb3c7f91bb38b0e43ef1df498dbcf..14d4f40df6d2942140610375cf9568def79d631e 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/base/model.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/model.h
@@ -10,9 +10,9 @@
 
 #include <parboil.h>
 
-#define D2R M_PI/180.0
-#define R2D 180.0/M_PI
-#define R2AM 60.0*180.0/M_PI
+#define D2R M_PI / 180.0
+#define R2D 180.0 / M_PI
+#define R2AM 60.0 * 180.0 / M_PI
 
 #define bins_per_dec 5
 #define min_arcmin 1.0
@@ -22,21 +22,18 @@
 
 typedef unsigned long hist_t;
 
-struct spherical 
-{
-  float ra, dec;  // latitude, longitude pair
+struct spherical {
+  float ra, dec; // latitude, longitude pair
 };
- 
-struct cartesian 
-{
-  float x, y, z;  // cartesian coodrinates
+
+struct cartesian {
+  float x, y, z; // cartesian coordinates
 };
 
 int readdatafile(char *fname, struct cartesian *data, int npoints);
 
-int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, 
-	      int n2, int doSelf, long long *data_bins, 
-	      int nbins, float *binb);
+int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, int n2,
+              int doSelf, long long *data_bins, int nbins, float *binb);
 
 void initBinB(struct pb_TimerSet *timers);
 
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/model_compute_cpu.c b/hpvm/test/parboil/benchmarks/tpacf/src/base/model_compute_cpu.c
index 1e2a114b4d7d1f97646dfa7a15ac477f2a7f1745..b74f27ffd443d6cf6727863b986ee187bada299f 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/base/model_compute_cpu.c
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/model_compute_cpu.c
@@ -5,64 +5,51 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
-#include <sys/time.h>
-#include <string.h>
 #include <math.h>
-#include <stdio.h> 
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
 
 #include "model.h"
 
-int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, 
-	      int n2, int doSelf, long long *data_bins, 
-	      int nbins, float *binb)
-{
+int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, int n2,
+              int doSelf, long long *data_bins, int nbins, float *binb) {
   int i, j, k;
-  if (doSelf)
-    {
-      n2 = n1;
-      data2 = data1;
-    }
-  
-  for (i = 0; i < ((doSelf) ? n1-1 : n1); i++)
-    {
-      const register float xi = data1[i].x;
-      const register float yi = data1[i].y;
-      const register float zi = data1[i].z;
-      
-      for (j = ((doSelf) ? i+1 : 0); j < n2; j++)
-        {
-	  register float dot = xi * data2[j].x + yi * data2[j].y + 
-	    zi * data2[j].z;
-	  
-	  // run binary search
-	  register int min = 0;
-	  register int max = nbins;
-	  register int k, indx;
-	  
-	  while (max > min+1)
-            {
-	      k = (min + max) / 2;
-	      if (dot >= binb[k]) 
-		max = k;
-	      else 
-		min = k;
-            };
-	  
-	  if (dot >= binb[min]) 
-	    {
-	      data_bins[min] += 1; /*k = min;*/ 
-	    }
-	  else if (dot < binb[max]) 
-	    { 
-	      data_bins[max+1] += 1; /*k = max+1;*/ 
-	    }
-	  else 
-	    { 
-	      data_bins[max] += 1; /*k = max;*/ 
-	    }
-        }
+  if (doSelf) {
+    n2 = n1;
+    data2 = data1;
+  }
+
+  for (i = 0; i < ((doSelf) ? n1 - 1 : n1); i++) {
+    const register float xi = data1[i].x;
+    const register float yi = data1[i].y;
+    const register float zi = data1[i].z;
+
+    for (j = ((doSelf) ? i + 1 : 0); j < n2; j++) {
+      register float dot = xi * data2[j].x + yi * data2[j].y + zi * data2[j].z;
+
+      // run binary search
+      register int min = 0;
+      register int max = nbins;
+      register int k, indx;
+
+      while (max > min + 1) {
+        k = (min + max) / 2;
+        if (dot >= binb[k])
+          max = k;
+        else
+          min = k;
+      };
+
+      if (dot >= binb[min]) {
+        data_bins[min] += 1; /*k = min;*/
+      } else if (dot < binb[max]) {
+        data_bins[max + 1] += 1; /*k = max+1;*/
+      } else {
+        data_bins[max] += 1; /*k = max;*/
+      }
     }
-  
+  }
+
   return 0;
 }
-
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/base/model_io.c b/hpvm/test/parboil/benchmarks/tpacf/src/base/model_io.c
index 3ee12500dcb5ccbc7f36b9db1da41d5e12f93126..ddc37cfb2b288b6bf8d5ebbd84ccd34d563e26fe 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/base/model_io.c
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/base/model_io.c
@@ -5,45 +5,40 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
-#include <sys/time.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
 #include <strings.h>
-#include <math.h>
+#include <sys/time.h>
 
 #include "model.h"
 
-int readdatafile(char *fname, struct cartesian *data, int npoints)
-{
+int readdatafile(char *fname, struct cartesian *data, int npoints) {
   FILE *infile;
   int lcount = 0;
   float ra, dec;
 
-  if ((infile = fopen(fname, "r")) == NULL)
-    {
-      fprintf(stderr, "Unable to open data file %s for reading\n", fname);
-      return lcount;
-    }
+  if ((infile = fopen(fname, "r")) == NULL) {
+    fprintf(stderr, "Unable to open data file %s for reading\n", fname);
+    return lcount;
+  }
+
+  for (lcount = 0; lcount < npoints; lcount++) {
+    if (fscanf(infile, "%f %f", &ra, &dec) != 2)
+      break;
 
-  for (lcount = 0; lcount < npoints; lcount++)
     {
-      if (fscanf(infile, "%f %f", &ra, &dec) != 2)
-	break;
-
-        {
-          // data conversion
-	  float rarad = D2R * ra;
-	  float decrad = D2R * dec;
-	  float cd = cos(decrad);
-	  
-	  data[lcount].x = cos(rarad) * cd;
-	  data[lcount].y = sin(rarad) * cd;
-	  data[lcount].z = sin(decrad);
-        }
+      // data conversion
+      float rarad = D2R * ra;
+      float decrad = D2R * dec;
+      float cd = cos(decrad);
+
+      data[lcount].x = cos(rarad) * cd;
+      data[lcount].y = sin(rarad) * cd;
+      data[lcount].z = sin(decrad);
     }
+  }
 
   fclose(infile);
-  
+
   return lcount;
 }
-
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.cc
index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.cc
@@ -5,22 +5,21 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include "args.h"
+#include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-#include <stdio.h>
-#include "args.h"
 
 extern char *optarg;
 
-void usage(char *name)
-{
+void usage(char *name) {
   printf("Usage: %s <-d data_file_name> <-r rnd_file_name> "
-	 "<-m rnd_count> <-p count> <-o file_name>\n", name);
+         "<-n rnd_count> <-p count> <-o file_name>\n",
+         name);
   exit(0);
 }
 
-void parse_args(int argc, char **argv, options* args)
-{
+void parse_args(int argc, char **argv, options *args) {
   int c;
 
   args->data_name = NULL;
@@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args)
   args->random_count = 0;
   args->npoints = 0;
   args->output_name = NULL;
-  
-  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF)
-    {
-      switch (c)
-	{
-        case 'd':
-          args->data_name = optarg;
-          break;
-        case 'r':
-          args->random_name = optarg;
-          break;
-        case 'n':
-          args->random_count = atoi(optarg);
-          break;
-        case 'o':
-          args->output_name = optarg;
-          break;
-        case 'p':
-          args->npoints = atol(optarg);
-          break;
-        default:
-          usage(argv[0]);
-	}
+
+  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) {
+    switch (c) {
+    case 'd':
+      args->data_name = optarg;
+      break;
+    case 'r':
+      args->random_name = optarg;
+      break;
+    case 'n':
+      args->random_count = atoi(optarg);
+      break;
+    case 'o':
+      args->output_name = optarg;
+      break;
+    case 'p':
+      args->npoints = atol(optarg);
+      break;
+    default:
+      usage(argv[0]);
     }
+  }
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.h
index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/args.h
@@ -8,8 +8,7 @@
 #ifndef __ARGS_H__
 #define __ARGS_H__
 
-typedef struct _options_
-{
+typedef struct _options_ {
   char *data_name;
   char *random_name;
   int random_count;
@@ -18,6 +17,6 @@ typedef struct _options_
 } options;
 
 void usage(char *name);
-void parse_args(int argc, char **argv, options* args);
+void parse_args(int argc, char **argv, options *args);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model.h
index a8a855872a20342e58a0906faf0c73cc9763f355..fdde265dd1f43582c3b97231189878ee9ea35b5f 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model.h
@@ -10,9 +10,9 @@
 
 #include <parboil.h>
 
-#define D2R M_PI/180.0
-#define R2D 180.0/M_PI
-#define R2AM 60.0*180.0/M_PI
+#define D2R M_PI / 180.0
+#define R2D 180.0 / M_PI
+#define R2AM 60.0 * 180.0 / M_PI
 
 #define bins_per_dec 5
 #define min_arcmin 1.0
@@ -23,21 +23,19 @@
 #define SINGLE_PRECISION 1
 
 #if SINGLE_PRECISION
-  #define REAL float
+#define REAL float
 #else
-  #define REAL double
+#define REAL double
 #endif
 
 typedef unsigned long hist_t;
 
-struct spherical 
-{
-  REAL ra, dec;  // latitude, longitude pair
+struct spherical {
+  REAL ra, dec; // latitude, longitude pair
 };
- 
-struct cartesian 
-{
-  REAL x, y, z;  // cartesian coodrinates
+
+struct cartesian {
+  REAL x, y, z; // cartesian coordinates
 };
 
 int readdatafile(char *fname, struct cartesian *data, int npoints);
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model_io.cc b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model_io.cc
index 23a21458f35ebb3d43dcd127691556650ca399d7..182f9ed43ef5579601c17d080a8cdea4d487da09 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model_io.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda/model_io.cc
@@ -5,49 +5,44 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
-#include <sys/time.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
 #include <strings.h>
-#include <math.h>
+#include <sys/time.h>
 
 #include "model.h"
 
-int readdatafile(char *fname, struct cartesian *data, int npoints)
-{
+int readdatafile(char *fname, struct cartesian *data, int npoints) {
   FILE *infile;
   int lcount = 0;
   REAL ra, dec;
 
-  if ((infile = fopen(fname, "r")) == NULL)
-    {
-      fprintf(stderr, "Unable to open data file %s for reading\n", fname);
-      return lcount;
-    }
+  if ((infile = fopen(fname, "r")) == NULL) {
+    fprintf(stderr, "Unable to open data file %s for reading\n", fname);
+    return lcount;
+  }
+
+  for (lcount = 0; lcount < npoints; lcount++) {
+#if SINGLE_PRECISION
+    if (fscanf(infile, "%f %f", &ra, &dec) != 2)
+#else
+    if (fscanf(infile, "%lf %lf", &ra, &dec) != 2)
+#endif
+      break;
 
-  for (lcount = 0; lcount < npoints; lcount++)
     {
-      #if SINGLE_PRECISION
-      if (fscanf(infile, "%f %f", &ra, &dec) != 2)
-      #else
-      if (fscanf(infile, "%lf %lf", &ra, &dec) != 2)
-      #endif
-	break;
-
-      {
-        // data conversion
-        REAL rarad = D2R * ra;
-        REAL decrad = D2R * dec;
-        REAL cd = cos(decrad);
-	
-	data[lcount].x = cos(rarad) * cd;
-	data[lcount].y = sin(rarad) * cd;
-	data[lcount].z = sin(decrad);
-      }
+      // data conversion
+      REAL rarad = D2R * ra;
+      REAL decrad = D2R * dec;
+      REAL cd = cos(decrad);
+
+      data[lcount].x = cos(rarad) * cd;
+      data[lcount].y = sin(rarad) * cd;
+      data[lcount].z = sin(decrad);
     }
+  }
 
   fclose(infile);
-  
+
   return lcount;
 }
-
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.cc
index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.cc
@@ -5,22 +5,21 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include "args.h"
+#include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-#include <stdio.h>
-#include "args.h"
 
 extern char *optarg;
 
-void usage(char *name)
-{
+void usage(char *name) {
   printf("Usage: %s <-d data_file_name> <-r rnd_file_name> "
-	 "<-m rnd_count> <-p count> <-o file_name>\n", name);
+         "<-n rnd_count> <-p count> <-o file_name>\n",
+         name);
   exit(0);
 }
 
-void parse_args(int argc, char **argv, options* args)
-{
+void parse_args(int argc, char **argv, options *args) {
   int c;
 
   args->data_name = NULL;
@@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args)
   args->random_count = 0;
   args->npoints = 0;
   args->output_name = NULL;
-  
-  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF)
-    {
-      switch (c)
-	{
-        case 'd':
-          args->data_name = optarg;
-          break;
-        case 'r':
-          args->random_name = optarg;
-          break;
-        case 'n':
-          args->random_count = atoi(optarg);
-          break;
-        case 'o':
-          args->output_name = optarg;
-          break;
-        case 'p':
-          args->npoints = atol(optarg);
-          break;
-        default:
-          usage(argv[0]);
-	}
+
+  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) {
+    switch (c) {
+    case 'd':
+      args->data_name = optarg;
+      break;
+    case 'r':
+      args->random_name = optarg;
+      break;
+    case 'n':
+      args->random_count = atoi(optarg);
+      break;
+    case 'o':
+      args->output_name = optarg;
+      break;
+    case 'p':
+      args->npoints = atol(optarg);
+      break;
+    default:
+      usage(argv[0]);
     }
+  }
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.h
index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/args.h
@@ -8,8 +8,7 @@
 #ifndef __ARGS_H__
 #define __ARGS_H__
 
-typedef struct _options_
-{
+typedef struct _options_ {
   char *data_name;
   char *random_name;
   int random_count;
@@ -18,6 +17,6 @@ typedef struct _options_
 } options;
 
 void usage(char *name);
-void parse_args(int argc, char **argv, options* args);
+void parse_args(int argc, char **argv, options *args);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model.h
index a3c273f0c5b45f8486728ef6fee8b1ab9404136e..a4ffce895fbbf4846681ace0848f2ecbe5a5e741 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model.h
@@ -10,9 +10,9 @@
 
 #include <parboil.h>
 
-#define D2R M_PI/180.0
-#define R2D 180.0/M_PI
-#define R2AM 60.0*180.0/M_PI
+#define D2R M_PI / 180.0
+#define R2D 180.0 / M_PI
+#define R2AM 60.0 * 180.0 / M_PI
 
 #define bins_per_dec 5
 #define min_arcmin 1.0
@@ -22,14 +22,12 @@
 
 typedef unsigned long hist_t;
 
-struct spherical 
-{
-  float ra, dec;  // latitude, longitude pair
+struct spherical {
+  float ra, dec; // latitude, longitude pair
 };
- 
-struct cartesian 
-{
-  float x, y, z;  // cartesian coodrinates
+
+struct cartesian {
+  float x, y, z; // cartesian coordinates
 };
 
 int readdatafile(char *fname, struct cartesian *data, int npoints);
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model_io.cc b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model_io.cc
index 1374ba4b19e7352a7717241e7e2f662cc7c18fad..ddc37cfb2b288b6bf8d5ebbd84ccd34d563e26fe 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model_io.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/cuda_base/model_io.cc
@@ -5,45 +5,40 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
-#include <sys/time.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
 #include <strings.h>
-#include <math.h>
+#include <sys/time.h>
 
 #include "model.h"
 
-int readdatafile(char *fname, struct cartesian *data, int npoints)
-{
+int readdatafile(char *fname, struct cartesian *data, int npoints) {
   FILE *infile;
   int lcount = 0;
   float ra, dec;
 
-  if ((infile = fopen(fname, "r")) == NULL)
-    {
-      fprintf(stderr, "Unable to open data file %s for reading\n", fname);
-      return lcount;
-    }
+  if ((infile = fopen(fname, "r")) == NULL) {
+    fprintf(stderr, "Unable to open data file %s for reading\n", fname);
+    return lcount;
+  }
+
+  for (lcount = 0; lcount < npoints; lcount++) {
+    if (fscanf(infile, "%f %f", &ra, &dec) != 2)
+      break;
 
-  for (lcount = 0; lcount < npoints; lcount++)
     {
-      if (fscanf(infile, "%f %f", &ra, &dec) != 2)
-	break;
-
-      {
-        // data conversion
-        float rarad = D2R * ra;
-        float decrad = D2R * dec;
-        float cd = cos(decrad);
-	
-	data[lcount].x = cos(rarad) * cd;
-	data[lcount].y = sin(rarad) * cd;
-	data[lcount].z = sin(decrad);
-      }
+      // data conversion
+      float rarad = D2R * ra;
+      float decrad = D2R * dec;
+      float cd = cos(decrad);
+
+      data[lcount].x = cos(rarad) * cd;
+      data[lcount].y = sin(rarad) * cd;
+      data[lcount].z = sin(decrad);
     }
+  }
 
   fclose(infile);
-  
+
   return lcount;
 }
-
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.c b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.c
index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.c
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.c
@@ -5,22 +5,21 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include "args.h"
+#include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-#include <stdio.h>
-#include "args.h"
 
 extern char *optarg;
 
-void usage(char *name)
-{
+void usage(char *name) {
   printf("Usage: %s <-d data_file_name> <-r rnd_file_name> "
-	 "<-m rnd_count> <-p count> <-o file_name>\n", name);
+         "<-n rnd_count> <-p count> <-o file_name>\n",
+         name);
   exit(0);
 }
 
-void parse_args(int argc, char **argv, options* args)
-{
+void parse_args(int argc, char **argv, options *args) {
   int c;
 
   args->data_name = NULL;
@@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args)
   args->random_count = 0;
   args->npoints = 0;
   args->output_name = NULL;
-  
-  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF)
-    {
-      switch (c)
-	{
-        case 'd':
-          args->data_name = optarg;
-          break;
-        case 'r':
-          args->random_name = optarg;
-          break;
-        case 'n':
-          args->random_count = atoi(optarg);
-          break;
-        case 'o':
-          args->output_name = optarg;
-          break;
-        case 'p':
-          args->npoints = atol(optarg);
-          break;
-        default:
-          usage(argv[0]);
-	}
+
+  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) {
+    switch (c) {
+    case 'd':
+      args->data_name = optarg;
+      break;
+    case 'r':
+      args->random_name = optarg;
+      break;
+    case 'n':
+      args->random_count = atoi(optarg);
+      break;
+    case 'o':
+      args->output_name = optarg;
+      break;
+    case 'p':
+      args->npoints = atol(optarg);
+      break;
+    default:
+      usage(argv[0]);
     }
+  }
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.h
index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/args.h
@@ -8,8 +8,7 @@
 #ifndef __ARGS_H__
 #define __ARGS_H__
 
-typedef struct _options_
-{
+typedef struct _options_ {
   char *data_name;
   char *random_name;
   int random_count;
@@ -18,6 +17,6 @@ typedef struct _options_
 } options;
 
 void usage(char *name);
-void parse_args(int argc, char **argv, options* args);
+void parse_args(int argc, char **argv, options *args);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/main.c b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/main.c
index 2bafdf4580c5a6f4402cf40991c93bffcf8ce3ee..da9b51a7202148e43f9b5bf51156b0d651473571 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/main.c
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/main.c
@@ -5,18 +5,17 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include <math.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <strings.h>
-#include <unistd.h>
 #include <sys/time.h>
-#include <math.h>
+#include <unistd.h>
 
 #include "args.h"
 #include "model.h"
 
-int main( int argc, char **argv )
-{
+int main(int argc, char **argv) {
   struct pb_TimerSet timers;
   struct pb_Parameters *params;
   int rf, k, nbins, npd, npr;
@@ -26,77 +25,65 @@ int main( int argc, char **argv )
   struct cartesian *data, *random;
   FILE *outfile;
 
-  pb_InitializeTimerSet( &timers );
-  params = pb_ReadParameters( &argc, argv );
+  pb_InitializeTimerSet(&timers);
+  params = pb_ReadParameters(&argc, argv);
 
   options args;
-  parse_args( argc, argv, &args );
-    
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  nbins = (int)floor(bins_per_dec * (log10(max_arcmin) - 
-					 log10(min_arcmin)));
-  memsize = (nbins+2)*sizeof(long long);
-    
+  parse_args(argc, argv, &args);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  nbins = (int)floor(bins_per_dec * (log10(max_arcmin) - log10(min_arcmin)));
+  memsize = (nbins + 2) * sizeof(long long);
+
   // memory for bin boundaries
-  binb = (float *)malloc((nbins+1)*sizeof(float));
-  if (binb == NULL)
-    {
-      fprintf(stderr, "Unable to allocate memory\n");
-      exit(-1);
-    }
-  for (k = 0; k < nbins+1; k++)
-    {
-      binb[k] = cos(pow(10, log10(min_arcmin) + 
-			k*1.0/bins_per_dec) / 60.0*D2R);
-    }
-    
+  binb = (float *)malloc((nbins + 1) * sizeof(float));
+  if (binb == NULL) {
+    fprintf(stderr, "Unable to allocate memory\n");
+    exit(-1);
+  }
+  for (k = 0; k < nbins + 1; k++) {
+    binb[k] =
+        cos(pow(10, log10(min_arcmin) + k * 1.0 / bins_per_dec) / 60.0 * D2R);
+  }
+
   // memory for DD
-  DD = (long long*)malloc(memsize);
-  if (DD == NULL)
-    {
-      fprintf(stderr, "Unable to allocate memory\n");
-      exit(-1);
-    }
+  DD = (long long *)malloc(memsize);
+  if (DD == NULL) {
+    fprintf(stderr, "Unable to allocate memory\n");
+    exit(-1);
+  }
   bzero(DD, memsize);
-    
+
   // memory for RR
-  RRS = (long long*)malloc(memsize);
-  if (RRS == NULL)
-    {
-      fprintf(stderr, "Unable to allocate memory\n");
-      exit(-1);
-    }
+  RRS = (long long *)malloc(memsize);
+  if (RRS == NULL) {
+    fprintf(stderr, "Unable to allocate memory\n");
+    exit(-1);
+  }
   bzero(RRS, memsize);
-    
+
   // memory for DR
-  DRS = (long long*)malloc(memsize);
-  if (DRS == NULL)
-    {
-      fprintf(stderr, "Unable to allocate memory\n");
-      exit(-1);
-    }
+  DRS = (long long *)malloc(memsize);
+  if (DRS == NULL) {
+    fprintf(stderr, "Unable to allocate memory\n");
+    exit(-1);
+  }
   bzero(DRS, memsize);
-    
+
   // memory for input data
-  data = (struct cartesian*)malloc
-    (args.npoints* sizeof(struct cartesian));
-  if (data == NULL)
-    {
-      fprintf(stderr, 
-	      "Unable to allocate memory for % data points (#1)\n", 
-	      args.npoints);
-      return(0);
-    }
-    
-  random = (struct cartesian*)malloc
-    (args.npoints*sizeof(struct cartesian));
-  if (random == NULL)
-    {
-      fprintf(stderr, 
-	      "Unable to allocate memory for % data points (#2)\n", 
-	      args.npoints);
-      return(0);
-    }
+  data = (struct cartesian *)malloc(args.npoints * sizeof(struct cartesian));
+  if (data == NULL) { // report how many points were requested, then give up
+    fprintf(stderr, "Unable to allocate memory for %d data points (#1)\n",
+            args.npoints);
+    return (0);
+  }
+
+  random = (struct cartesian *)malloc(args.npoints * sizeof(struct cartesian));
+  if (random == NULL) { // same report for the random-point buffer
+    fprintf(stderr, "Unable to allocate memory for %d data points (#2)\n",
+            args.npoints);
+    return (0);
+  }
 
   printf("Min distance: %f arcmin\n", min_arcmin);
   printf("Max distance: %f arcmin\n", max_arcmin);
@@ -104,58 +91,51 @@ int main( int argc, char **argv )
   printf("Total bins  : %i\n", nbins);
 
   // read data file
-  pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
   npd = readdatafile(params->inpFiles[0], data, args.npoints);
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  if (npd != args.npoints)
-    {
-      fprintf(stderr, 
-	      "Error: read %i data points out of %i\n", 
-	      npd, args.npoints);
-      return(0);
-    }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  if (npd != args.npoints) {
+    fprintf(stderr, "Error: read %i data points out of %i\n", npd,
+            args.npoints);
+    return (0);
+  }
 
   // compute DD
   doCompute(data, npd, NULL, 0, 1, DD, nbins, binb);
 
   // loop through random data files
-  for (rf = 0; rf < args.random_count; rf++)
-    {
-      // read random file
-      pb_SwitchToTimer( &timers, pb_TimerID_IO );
-      npr = readdatafile(params->inpFiles[rf+1], random, args.npoints);
-      pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-      if (npr != args.npoints)
-        {
-	  fprintf(stderr, 
-		  "Error: read %i random points out of %i in file %s\n", 
-		  npr, args.npoints, params->inpFiles[rf+1]);
-	  return(0);
-        }
-
-      // compute RR
-      doCompute(random, npr, NULL, 0, 1, RRS, nbins, binb);
-
-      // compute DR
-      doCompute(data, npd, random, npr, 0, DRS, nbins, binb);
+  for (rf = 0; rf < args.random_count; rf++) {
+    // read random file
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    npr = readdatafile(params->inpFiles[rf + 1], random, args.npoints);
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    if (npr != args.npoints) {
+      fprintf(stderr, "Error: read %i random points out of %i in file %s\n",
+              npr, args.npoints, params->inpFiles[rf + 1]);
+      return (0);
     }
 
+    // compute RR
+    doCompute(random, npr, NULL, 0, 1, RRS, nbins, binb);
+
+    // compute DR
+    doCompute(data, npd, random, npr, 0, DRS, nbins, binb);
+  }
+
   // compute and output results
-  if ((outfile = fopen(params->outFile, "w")) == NULL)
-    {
-      fprintf(stderr, 
-	      "Unable to open output file %s for writing, assuming stdout\n", 
-	      params->outFile);
-      outfile = stdout;
-    }
+  if ((outfile = fopen(params->outFile, "w")) == NULL) {
+    fprintf(stderr,
+            "Unable to open output file %s for writing, assuming stdout\n",
+            params->outFile);
+    outfile = stdout;
+  }
 
-  pb_SwitchToTimer( &timers, pb_TimerID_IO );
-  for (k = 1; k < nbins+1; k++)
-    {
-      fprintf(outfile, "%d\n%d\n%d\n", DD[k], DRS[k], RRS[k]);      
-    }
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+  for (k = 1; k < nbins + 1; k++) { // bins are long long: print with %lld
+    fprintf(outfile, "%lld\n%lld\n%lld\n", DD[k], DRS[k], RRS[k]);
+  }
 
-  if(outfile != stdout)
+  if (outfile != stdout)
     fclose(outfile);
 
   // free memory
@@ -165,9 +145,8 @@ int main( int argc, char **argv )
   free(DD);
   free(RRS);
   free(DRS);
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_NONE );
-  pb_PrintTimerSet( &timers );
-  pb_FreeParameters( params );
-}
 
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(params);
+}
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model.h
index f2b182ba412bb3c7f91bb38b0e43ef1df498dbcf..14d4f40df6d2942140610375cf9568def79d631e 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model.h
@@ -10,9 +10,9 @@
 
 #include <parboil.h>
 
-#define D2R M_PI/180.0
-#define R2D 180.0/M_PI
-#define R2AM 60.0*180.0/M_PI
+#define D2R M_PI / 180.0
+#define R2D 180.0 / M_PI
+#define R2AM 60.0 * 180.0 / M_PI
 
 #define bins_per_dec 5
 #define min_arcmin 1.0
@@ -22,21 +22,18 @@
 
 typedef unsigned long hist_t;
 
-struct spherical 
-{
-  float ra, dec;  // latitude, longitude pair
+struct spherical {
+  float ra, dec; // latitude, longitude pair
 };
- 
-struct cartesian 
-{
-  float x, y, z;  // cartesian coodrinates
+
+struct cartesian {
+  float x, y, z; // cartesian coodrinates
 };
 
 int readdatafile(char *fname, struct cartesian *data, int npoints);
 
-int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, 
-	      int n2, int doSelf, long long *data_bins, 
-	      int nbins, float *binb);
+int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, int n2,
+              int doSelf, long long *data_bins, int nbins, float *binb);
 
 void initBinB(struct pb_TimerSet *timers);
 
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_compute_cpu.c b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_compute_cpu.c
index 25f9e4400c4545ba4acce2183fff76f89ab94ed5..d6f0dee83a044590f37de32426de2b4ff3cd56d6 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_compute_cpu.c
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_compute_cpu.c
@@ -5,69 +5,55 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
-#include <sys/time.h>
-#include <string.h>
 #include <math.h>
-#include <stdio.h> 
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
 
 #include "model.h"
 
-int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, 
-	      int n2, int doSelf, long long *data_bins, 
-	      int nbins, float *binb)
-{
+int doCompute(struct cartesian *data1, int n1, struct cartesian *data2, int n2,
+              int doSelf, long long *data_bins, int nbins, float *binb) {
   int i, j, k;
-  if (doSelf)
-    {
-      n2 = n1;
-      data2 = data1;
-    }
-//  #pragma omp parallel for 
-  for (i = 0; i < ((doSelf) ? n1-1 : n1); i++)
-    {
-      const register float xi = data1[i].x;
-      const register float yi = data1[i].y;
-      const register float zi = data1[i].z;
+  if (doSelf) {
+    n2 = n1;
+    data2 = data1;
+  }
+  //  #pragma omp parallel for
+  for (i = 0; i < ((doSelf) ? n1 - 1 : n1); i++) {
+    const register float xi = data1[i].x;
+    const register float yi = data1[i].y;
+    const register float zi = data1[i].z;
+
+#pragma omp parallel for
+    for (j = ((doSelf) ? i + 1 : 0); j < n2; j++) {
+      register float dot = xi * data2[j].x + yi * data2[j].y + zi * data2[j].z;
 
-      #pragma omp parallel for      
-      for (j = ((doSelf) ? i+1 : 0); j < n2; j++)
-        {
-	  register float dot = xi * data2[j].x + yi * data2[j].y + 
-	    zi * data2[j].z;
-	  
-	  // run binary search
-	  register int min = 0;
-	  register int max = nbins;
-	  register int k, indx;
-	  
+      // run binary search
+      register int min = 0;
+      register int max = nbins;
+      register int k, indx;
 
-	  while (max > min+1)
-            {
-	      k = (min + max) / 2;
-	      if (dot >= binb[k]) 
-		max = k;
-	      else 
-		min = k;
-            };
-    #pragma omp critical	  
-	  if (dot >= binb[min]) 
-	    {
-//        #pragma omp critical
-	      data_bins[min] += 1; /*k = min;*/ 
-	    }
-	  else if (dot < binb[max]) 
-	    { 
-  //      #pragma omp critical
-	      data_bins[max+1] += 1; /*k = max+1;*/ 
-	    }
-	  else 
-	    { 
-    //    #pragma omp critical
-	      data_bins[max] += 1; /*k = max;*/ 
-	    }
-        }
+      while (max > min + 1) {
+        k = (min + max) / 2;
+        if (dot >= binb[k])
+          max = k;
+        else
+          min = k;
+      };
+#pragma omp critical
+      if (dot >= binb[min]) {
+        //        #pragma omp critical
+        data_bins[min] += 1; /*k = min;*/
+      } else if (dot < binb[max]) {
+        //      #pragma omp critical
+        data_bins[max + 1] += 1; /*k = max+1;*/
+      } else {
+        //    #pragma omp critical
+        data_bins[max] += 1; /*k = max;*/
+      }
     }
-  
+  }
+
   return 0;
 }
-
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_io.c b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_io.c
index 3ee12500dcb5ccbc7f36b9db1da41d5e12f93126..ddc37cfb2b288b6bf8d5ebbd84ccd34d563e26fe 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_io.c
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/omp_base/model_io.c
@@ -5,45 +5,40 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
-#include <sys/time.h>
-#include <stdio.h>
 #include <math.h>
+#include <stdio.h>
 #include <strings.h>
-#include <math.h>
+#include <sys/time.h>
 
 #include "model.h"
 
-int readdatafile(char *fname, struct cartesian *data, int npoints)
-{
+int readdatafile(char *fname, struct cartesian *data, int npoints) {
   FILE *infile;
   int lcount = 0;
   float ra, dec;
 
-  if ((infile = fopen(fname, "r")) == NULL)
-    {
-      fprintf(stderr, "Unable to open data file %s for reading\n", fname);
-      return lcount;
-    }
+  if ((infile = fopen(fname, "r")) == NULL) {
+    fprintf(stderr, "Unable to open data file %s for reading\n", fname);
+    return lcount;
+  }
+
+  for (lcount = 0; lcount < npoints; lcount++) {
+    if (fscanf(infile, "%f %f", &ra, &dec) != 2)
+      break;
 
-  for (lcount = 0; lcount < npoints; lcount++)
     {
-      if (fscanf(infile, "%f %f", &ra, &dec) != 2)
-	break;
-
-        {
-          // data conversion
-	  float rarad = D2R * ra;
-	  float decrad = D2R * dec;
-	  float cd = cos(decrad);
-	  
-	  data[lcount].x = cos(rarad) * cd;
-	  data[lcount].y = sin(rarad) * cd;
-	  data[lcount].z = sin(decrad);
-        }
+      // data conversion
+      float rarad = D2R * ra;
+      float decrad = D2R * dec;
+      float cd = cos(decrad);
+
+      data[lcount].x = cos(rarad) * cd;
+      data[lcount].y = sin(rarad) * cd;
+      data[lcount].z = sin(decrad);
     }
+  }
 
   fclose(infile);
-  
+
   return lcount;
 }
-
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.cc
index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.cc
@@ -5,22 +5,21 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include "args.h"
+#include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-#include <stdio.h>
-#include "args.h"
 
 extern char *optarg;
 
-void usage(char *name)
-{
+void usage(char *name) {
   printf("Usage: %s <-d data_file_name> <-r rnd_file_name> "
-	 "<-m rnd_count> <-p count> <-o file_name>\n", name);
+         "<-n rnd_count> <-p count> <-o file_name>\n",
+         name);
   exit(0);
 }
 
-void parse_args(int argc, char **argv, options* args)
-{
+void parse_args(int argc, char **argv, options *args) {
   int c;
 
   args->data_name = NULL;
@@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args)
   args->random_count = 0;
   args->npoints = 0;
   args->output_name = NULL;
-  
-  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF)
-    {
-      switch (c)
-	{
-        case 'd':
-          args->data_name = optarg;
-          break;
-        case 'r':
-          args->random_name = optarg;
-          break;
-        case 'n':
-          args->random_count = atoi(optarg);
-          break;
-        case 'o':
-          args->output_name = optarg;
-          break;
-        case 'p':
-          args->npoints = atol(optarg);
-          break;
-        default:
-          usage(argv[0]);
-	}
+
+  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) {
+    switch (c) {
+    case 'd':
+      args->data_name = optarg;
+      break;
+    case 'r':
+      args->random_name = optarg;
+      break;
+    case 'n':
+      args->random_count = atoi(optarg);
+      break;
+    case 'o':
+      args->output_name = optarg;
+      break;
+    case 'p':
+      args->npoints = atol(optarg);
+      break;
+    default:
+      usage(argv[0]);
     }
+  }
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.h
index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/args.h
@@ -8,8 +8,7 @@
 #ifndef __ARGS_H__
 #define __ARGS_H__
 
-typedef struct _options_
-{
+typedef struct _options_ {
   char *data_name;
   char *random_name;
   int random_count;
@@ -18,6 +17,6 @@ typedef struct _options_
 } options;
 
 void usage(char *name);
-void parse_args(int argc, char **argv, options* args);
+void parse_args(int argc, char **argv, options *args);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc
index 49d9f243c48352ea866e35add863aacb002a3a55..d945bccf4eae7f296394d74ac0617f3e20426dcd 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/main.cc
@@ -6,11 +6,11 @@
  *cr
  ***************************************************************************/
 #include <CL/cl.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #include "args.h"
 #include "model.h"
@@ -19,114 +19,109 @@ extern unsigned int NUM_SETS;
 extern unsigned int NUM_ELEMENTS;
 
 // create the bin boundaries
-void initBinB( struct pb_TimerSet *timers, cl_mem dev_binb, cl_command_queue clCommandQueue)
-{
-  float *binb = (float*)malloc((NUM_BINS+1)*sizeof(float));
-  for (int k = 0; k < NUM_BINS+1; k++)
-    {
-      binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec)) 
-		    / 60.0*D2R);
-    }
+void initBinB(struct pb_TimerSet *timers, cl_mem dev_binb,
+              cl_command_queue clCommandQueue) {
+  float *binb = (float *)malloc((NUM_BINS + 1) * sizeof(float));
+  for (int k = 0; k < NUM_BINS + 1; k++) {
+    binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) /
+                  60.0 * D2R);
+  }
 
-  pb_SwitchToTimer( timers, pb_TimerID_COPY );
+  pb_SwitchToTimer(timers, pb_TimerID_COPY);
 
   cl_int clStatus;
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dev_binb,CL_TRUE,0,(NUM_BINS+1)*sizeof(float),binb,0,NULL,NULL);
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, dev_binb, CL_TRUE, 0,
+                           (NUM_BINS + 1) * sizeof(float), binb, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
   free(binb);
 }
 
-void TPACF(cl_mem histograms, cl_mem d_x_data, 
-	   cl_mem dev_binb, 
-	   cl_command_queue clCommandQueue, cl_kernel clKernel)
-{
+void TPACF(cl_mem histograms, cl_mem d_x_data, cl_mem dev_binb,
+           cl_command_queue clCommandQueue, cl_kernel clKernel) {
   size_t dimBlock = BLOCK_SIZE;
-  size_t dimGrid = (NUM_SETS*2 + 1)*dimBlock;
-  
+  size_t dimGrid = (NUM_SETS * 2 + 1) * dimBlock;
+
   cl_int clStatus;
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&histograms);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_x_data);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&dev_binb);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),&NUM_SETS);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(int),&NUM_ELEMENTS);
-  
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &histograms);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_x_data);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &dev_binb);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &NUM_SETS);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), &NUM_ELEMENTS);
+
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dimGrid,&dimBlock,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &dimGrid,
+                                    &dimBlock, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
 
   clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int 
-main( int argc, char** argv) 
-{
+int main(int argc, char **argv) {
   struct pb_TimerSet timers;
   struct pb_Parameters *params;
 
-  params = pb_ReadParameters( &argc, argv );
+  params = pb_ReadParameters(&argc, argv);
 
   options args;
   parse_args(argc, argv, &args);
-  
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
   NUM_ELEMENTS = args.npoints;
   NUM_SETS = args.random_count;
-  int num_elements = NUM_ELEMENTS; 
-  
+  int num_elements = NUM_ELEMENTS;
+
   printf("Min distance: %f arcmin\n", min_arcmin);
   printf("Max distance: %f arcmin\n", max_arcmin);
   printf("Bins per dec: %i\n", bins_per_dec);
   printf("Total bins  : %i\n", NUM_BINS);
 
-  //read in files 
-  unsigned mem_size = (1+NUM_SETS)*num_elements*sizeof(struct cartesian);
-  unsigned f_mem_size = (1+NUM_SETS)*num_elements*sizeof(float);
+  // read in files
+  unsigned mem_size = (1 + NUM_SETS) * num_elements * sizeof(struct cartesian);
+  unsigned f_mem_size = (1 + NUM_SETS) * num_elements * sizeof(float);
 
   // container for all the points read from files
   struct cartesian *h_all_data;
-  h_all_data = (struct cartesian*) malloc(mem_size); 
+  h_all_data = (struct cartesian *)malloc(mem_size);
   // Until I can get libs fixed
-    
+
   // iterator for data files
   struct cartesian *working = h_all_data;
-    
+
   // go through and read all data and random points into h_all_data
-  //pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  // pb_SwitchToTimer( &timers, pb_TimerID_IO );
   readdatafile(params->inpFiles[0], working, num_elements);
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
   working += num_elements;
-  for(int i = 0; i < (NUM_SETS); i++)
-    {
-      //pb_SwitchToTimer( &timers, pb_TimerID_IO );
-      char fileName[50];
-      readdatafile(params->inpFiles[i+1], working, num_elements);
-      //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-      working += num_elements;
-    }
+  for (int i = 0; i < (NUM_SETS); i++) {
+    // pb_SwitchToTimer( &timers, pb_TimerID_IO );
+    char fileName[50];
+    readdatafile(params->inpFiles[i + 1], working, num_elements);
+    // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+    working += num_elements;
+  }
 
-  pb_InitializeTimerSet( &timers );
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   // split into x, y, and z arrays
-  float * h_x_data = (float*) malloc (3*f_mem_size);
-  float * h_y_data = h_x_data + NUM_ELEMENTS*(NUM_SETS+1);
-  float * h_z_data = h_y_data + NUM_ELEMENTS*(NUM_SETS+1);
-  for(int i = 0; i < (NUM_SETS+1); ++i)
-    {
-      for(int j = 0; j < NUM_ELEMENTS; ++j)
-	{
-	  h_x_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].x;
-	  h_y_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].y;
-	  h_z_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].z;
-	}
+  float *h_x_data = (float *)malloc(3 * f_mem_size);
+  float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1);
+  float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1);
+  for (int i = 0; i < (NUM_SETS + 1); ++i) {
+    for (int j = 0; j < NUM_ELEMENTS; ++j) {
+      h_x_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].x;
+      h_y_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].y;
+      h_z_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].z;
     }
+  }
 
   // from on use x, y, and z arrays, free h_all_data
   free(h_all_data);
@@ -134,136 +129,141 @@ main( int argc, char** argv)
   cl_int clStatus;
 
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
+
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
   CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"-I src/opencl_base");
+  sprintf(clOptions, "-I src/opencl_base");
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
 
-  cl_kernel clKernel = clCreateKernel(clProgram,"gen_hists",&clStatus);
+  cl_kernel clKernel = clCreateKernel(clProgram, "gen_hists", &clStatus);
   CHECK_ERROR("clCreateKernel")
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
   // allocate OpenCL memory to hold all points
-  //Sub-buffers are not defined in OpenCL 1.0
+  // Sub-buffers are not defined in OpenCL 1.0
   cl_mem d_x_data;
-  d_x_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,3*f_mem_size,NULL,&clStatus);
+  d_x_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 3 * f_mem_size, NULL,
+                            &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   // allocate OpenCL memory to hold final histograms
   // (1 for dd, and NUM_SETS for dr and rr apiece)
   cl_mem d_hists;
-  d_hists = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),NULL,&clStatus);
+  d_hists = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                           NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), NULL,
+                           &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   cl_mem dev_binb;
-  dev_binb = clCreateBuffer(clContext,CL_MEM_READ_ONLY,(NUM_BINS+1)*sizeof(float),NULL,&clStatus);
+  dev_binb = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                            (NUM_BINS + 1) * sizeof(float), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   // allocate system memory for final histograms
-  hist_t *new_hists = (hist_t *) malloc(NUM_BINS*(NUM_SETS*2+1)*
-					sizeof(hist_t));
+  hist_t *new_hists =
+      (hist_t *)malloc(NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t));
 
   // Initialize the boundary constants for bin search
   initBinB(&timers, dev_binb, clCommandQueue);
 
   // **===------------------ Kick off TPACF on OpenCL------------------===**
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_data,CL_TRUE,0,3*f_mem_size,h_x_data,0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_data, CL_TRUE, 0,
+                                  3 * f_mem_size, h_x_data, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
 
-  TPACF(d_hists,d_x_data,dev_binb,clCommandQueue,clKernel);
+  TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueReadBuffer(clCommandQueue,d_hists,CL_TRUE,0,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),new_hists,0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_hists, CL_TRUE, 0,
+                                 NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t),
+                                 new_hists, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // **===-------------------------------------------------------------===**
 
   // references into output histograms
   hist_t *dd_hist = new_hists;
   hist_t *rr_hist = dd_hist + NUM_BINS;
-  hist_t *dr_hist = rr_hist + NUM_BINS*NUM_SETS;
+  hist_t *dr_hist = rr_hist + NUM_BINS * NUM_SETS;
 
   // add up values within dr and rr
   int rr[NUM_BINS];
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      rr[i] = 0;
-    }
-  for(int i=0; i<NUM_SETS; i++)
-    {
-      for(int j=0; j<NUM_BINS; j++)
-	{
-	  rr[j] += rr_hist[i*NUM_BINS + j];
-	}
+  for (int i = 0; i < NUM_BINS; i++) {
+    rr[i] = 0;
+  }
+  for (int i = 0; i < NUM_SETS; i++) {
+    for (int j = 0; j < NUM_BINS; j++) {
+      rr[j] += rr_hist[i * NUM_BINS + j];
     }
+  }
   int dr[NUM_BINS];
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      dr[i] = 0;
-    }
-  for(int i=0; i<NUM_SETS; i++)
-    {
-      for(int j=0; j<NUM_BINS; j++)
-	{
-	  dr[j] += dr_hist[i*NUM_BINS + j];
-	}
+  for (int i = 0; i < NUM_BINS; i++) {
+    dr[i] = 0;
+  }
+  for (int i = 0; i < NUM_SETS; i++) {
+    for (int j = 0; j < NUM_BINS; j++) {
+      dr[j] += dr_hist[i * NUM_BINS + j];
     }
+  }
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
   FILE *outfile;
-  if ((outfile = fopen(params->outFile, "w")) == NULL)
-    {
-      fprintf(stderr, "Unable to open output file %s for writing, "
-	      "assuming stdout\n", params->outFile);
-      outfile = stdout;
-    }
-  
-  //pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  if ((outfile = fopen(params->outFile, "w")) == NULL) {
+    fprintf(stderr,
+            "Unable to open output file %s for writing, "
+            "assuming stdout\n",
+            params->outFile);
+    outfile = stdout;
+  }
+
+  // pb_SwitchToTimer( &timers, pb_TimerID_IO );
   // print out final histograms + omega (while calculating omega)
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]);
-    }
+  for (int i = 0; i < NUM_BINS; i++) { // hist_t is unsigned: cast, use %lu
+    fprintf(outfile, "%lu\n%d\n%d\n", (unsigned long)dd_hist[i], dr[i], rr[i]);
+  }
 
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  if(outfile != stdout)
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  if (outfile != stdout)
     fclose(outfile);
 
   // cleanup memory
   free(new_hists);
-  free( h_x_data);
+  free(h_x_data);
 
-  //pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  // pb_SwitchToTimer( &timers, pb_TimerID_COPY );
   clStatus = clReleaseMemObject(d_hists);
   clStatus = clReleaseMemObject(d_x_data);
   clStatus = clReleaseMemObject(dev_binb);
@@ -273,8 +273,7 @@ main( int argc, char** argv)
   clStatus = clReleaseContext(clContext);
   CHECK_ERROR("clReleaseContext")
 
-  free((void*)clSource[0]);
+  free((void *)clSource[0]);
 
   pb_FreeParameters(params);
 }
-
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.cc
index 97e9e9eb5518a56eff4cc7c9da7d5ce6d9b69e0b..9e7139ac6f43104a9b7b85c1f6d538257d827ab2 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.cc
@@ -6,83 +6,75 @@
  *cr
  ***************************************************************************/
 #include <CL/cl.h>
-#include <sys/time.h>
-#include <stdio.h>
-#include <math.h>
-#include <strings.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <strings.h>
+#include <sys/time.h>
 
 #include "model.h"
 
 unsigned int NUM_SETS;
 unsigned int NUM_ELEMENTS;
 
-int readdatafile(char *fname, struct cartesian *data, int npoints)
-{
+int readdatafile(char *fname, struct cartesian *data, int npoints) {
   FILE *infile;
   int lcount = 0;
   float ra, dec;
 
-  if ((infile = fopen(fname, "r")) == NULL)
-    {
-      fprintf(stderr, "Unable to open data file %s for reading\n", fname);
-      return lcount;
-    }
+  if ((infile = fopen(fname, "r")) == NULL) {
+    fprintf(stderr, "Unable to open data file %s for reading\n", fname);
+    return lcount;
+  }
+
+  for (lcount = 0; lcount < npoints; lcount++) {
+    if (fscanf(infile, "%f %f", &ra, &dec) != 2)
+      break;
 
-  for (lcount = 0; lcount < npoints; lcount++)
     {
-      if (fscanf(infile, "%f %f", &ra, &dec) != 2)
-	break;
+      // data conversion
+      float rarad = D2R * ra;
+      float decrad = D2R * dec;
+      float cd = cos(decrad);
 
-      {
-        // data conversion
-        float rarad = D2R * ra;
-        float decrad = D2R * dec;
-        float cd = cos(decrad);
-	
-	data[lcount].x = cos(rarad) * cd;
-	data[lcount].y = sin(rarad) * cd;
-	data[lcount].z = sin(decrad);
-      }
+      data[lcount].x = cos(rarad) * cd;
+      data[lcount].y = sin(rarad) * cd;
+      data[lcount].z = sin(decrad);
     }
+  }
 
   fclose(infile);
-  
+
   return lcount;
 }
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error: Cannot open kernel file for reading!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error: Cannot open kernel file for reading!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error: Cannot allocated buffer for file contents!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error: Cannot allocated buffer for file contents!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error: Cannot read kernel file contents!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error: Cannot read kernel file contents!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.h
index 1a8c149aac15b39ed9ecaaecc8318582babb33f6..f9df468e542d4104fb52e9e6782c7b8a1736648d 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base/model.h
@@ -8,9 +8,9 @@
 #ifndef __MODEL_H__
 #define __MODEL_H__
 
-#define D2R M_PI/180.0
-#define R2D 180.0/M_PI
-#define R2AM 60.0*180.0/M_PI
+#define D2R M_PI / 180.0
+#define R2D 180.0 / M_PI
+#define R2AM 60.0 * 180.0 / M_PI
 
 #define bins_per_dec 5
 #define min_arcmin 1.0
@@ -21,26 +21,23 @@
 
 typedef unsigned long hist_t;
 
-struct spherical 
-{
-  float ra, dec;  // latitude, longitude pair
+struct spherical {
+  float ra, dec; // longitude-like (ra), latitude-like (dec) pair
 };
- 
-struct cartesian 
-{
-  float x, y, z;  // cartesian coodrinates
+
+struct cartesian {
+  float x, y, z; // cartesian coordinates
 };
 
 int readdatafile(char *fname, struct cartesian *data, int npoints);
 
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.cc
index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.cc
@@ -5,22 +5,21 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include "args.h"
+#include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-#include <stdio.h>
-#include "args.h"
 
 extern char *optarg;
 
-void usage(char *name)
-{
+void usage(char *name) {
   printf("Usage: %s <-d data_file_name> <-r rnd_file_name> "
-	 "<-m rnd_count> <-p count> <-o file_name>\n", name);
+         "<-m rnd_count> <-p count> <-o file_name>\n",
+         name);
   exit(0);
 }
 
-void parse_args(int argc, char **argv, options* args)
-{
+void parse_args(int argc, char **argv, options *args) {
   int c;
 
   args->data_name = NULL;
@@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args)
   args->random_count = 0;
   args->npoints = 0;
   args->output_name = NULL;
-  
-  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF)
-    {
-      switch (c)
-	{
-        case 'd':
-          args->data_name = optarg;
-          break;
-        case 'r':
-          args->random_name = optarg;
-          break;
-        case 'n':
-          args->random_count = atoi(optarg);
-          break;
-        case 'o':
-          args->output_name = optarg;
-          break;
-        case 'p':
-          args->npoints = atol(optarg);
-          break;
-        default:
-          usage(argv[0]);
-	}
+
+  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) {
+    switch (c) {
+    case 'd':
+      args->data_name = optarg;
+      break;
+    case 'r':
+      args->random_name = optarg;
+      break;
+    case 'n':
+      args->random_count = atoi(optarg);
+      break;
+    case 'o':
+      args->output_name = optarg;
+      break;
+    case 'p':
+      args->npoints = atol(optarg);
+      break;
+    default:
+      usage(argv[0]);
     }
+  }
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.h
index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/args.h
@@ -8,8 +8,7 @@
 #ifndef __ARGS_H__
 #define __ARGS_H__
 
-typedef struct _options_
-{
+typedef struct _options_ {
   char *data_name;
   char *random_name;
   int random_count;
@@ -18,6 +17,6 @@ typedef struct _options_
 } options;
 
 void usage(char *name);
-void parse_args(int argc, char **argv, options* args);
+void parse_args(int argc, char **argv, options *args);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/main.cc
index 54f4f85539b2e71a24d220f08a8f59373d0976ce..453a983ee0dc19b6f4b6d32e883bf822407b281a 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/main.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/main.cc
@@ -6,147 +6,142 @@
  *cr
  ***************************************************************************/
 #include <CL/cl.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #include "args.h"
 #include "model.h"
 
-
 #define WARP_SIZE 32
 
-#define NUM_WARPS (BLOCK_SIZE/WARP_SIZE)
+#define NUM_WARPS (BLOCK_SIZE / WARP_SIZE)
 #define HISTS_PER_WARP 16
-#define NUM_HISTOGRAMS  (NUM_WARPS*HISTS_PER_WARP)
+#define NUM_HISTOGRAMS (NUM_WARPS * HISTS_PER_WARP)
 
 extern unsigned int NUM_SETS;
 extern unsigned int NUM_ELEMENTS;
 
 // create the bin boundaries
-void initBinB( struct pb_TimerSet *timers, cl_mem dev_binb, cl_command_queue clCommandQueue)
-{
-  float *binb = (float*)malloc((NUM_BINS+1)*sizeof(float));
-  for (int k = 0; k < NUM_BINS+1; k++)
-    {
-      binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec)) 
-		    / 60.0*D2R);
-    }
+void initBinB(struct pb_TimerSet *timers, cl_mem dev_binb,
+              cl_command_queue clCommandQueue) {
+  float *binb = (float *)malloc((NUM_BINS + 1) * sizeof(float));
+  for (int k = 0; k < NUM_BINS + 1; k++) {
+    binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) /
+                  60.0 * D2R);
+  }
 
-  pb_SwitchToTimer( timers, pb_TimerID_COPY );
+  pb_SwitchToTimer(timers, pb_TimerID_COPY);
 
   cl_int clStatus;
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dev_binb,CL_TRUE,0,(NUM_BINS+1)*sizeof(float),binb,0,NULL,NULL);
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, dev_binb, CL_TRUE, 0,
+                           (NUM_BINS + 1) * sizeof(float), binb, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
   free(binb);
 }
 
-void TPACF(cl_mem histograms, cl_mem d_x_data, 
-	   cl_mem dev_binb, 
-	   cl_command_queue clCommandQueue, cl_kernel clKernel)
-{
+void TPACF(cl_mem histograms, cl_mem d_x_data, cl_mem dev_binb,
+           cl_command_queue clCommandQueue, cl_kernel clKernel) {
   size_t dimBlock = BLOCK_SIZE;
-  size_t dimGrid = (NUM_SETS*2 + 1)*dimBlock;
-  long shSize = NUM_BINS*NUM_HISTOGRAMS*sizeof(unsigned int);
+  size_t dimGrid = (NUM_SETS * 2 + 1) * dimBlock;
+  long shSize = NUM_BINS * NUM_HISTOGRAMS * sizeof(unsigned int);
   long glSize = 0L;
 
   cl_int clStatus;
 
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&histograms);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(long),&glSize);
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &histograms);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(long), &glSize);
+
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &d_x_data);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(long), &glSize);
 
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&d_x_data);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(long),&glSize);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(cl_mem), &dev_binb);
+  clStatus = clSetKernelArg(clKernel, 5, sizeof(long), &glSize);
 
-  clStatus = clSetKernelArg(clKernel,4,sizeof(cl_mem),&dev_binb);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(long),&glSize);
+  clStatus = clSetKernelArg(clKernel, 6, sizeof(int), &NUM_SETS);
 
-  clStatus = clSetKernelArg(clKernel,6,sizeof(int),&NUM_SETS);
+  clStatus = clSetKernelArg(clKernel, 7, sizeof(int), &NUM_ELEMENTS);
 
-  clStatus = clSetKernelArg(clKernel,7,sizeof(int),&NUM_ELEMENTS);
+  clStatus = clSetKernelArg(
+      clKernel, 8, NUM_BINS * NUM_HISTOGRAMS * sizeof(unsigned int), NULL);
+  clStatus = clSetKernelArg(clKernel, 9, sizeof(long), &shSize);
 
-  clStatus = clSetKernelArg(clKernel,8,NUM_BINS*NUM_HISTOGRAMS*sizeof(unsigned int),NULL);
-  clStatus = clSetKernelArg(clKernel,9,sizeof(long),&shSize);
-  
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dimGrid,&dimBlock,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &dimGrid,
+                                    &dimBlock, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
 
   clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int 
-main( int argc, char** argv) 
-{
+int main(int argc, char **argv) {
   struct pb_TimerSet timers;
   struct pb_Parameters *params;
 
-  params = pb_ReadParameters( &argc, argv );
+  params = pb_ReadParameters(&argc, argv);
 
   options args;
   parse_args(argc, argv, &args);
-  
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
   NUM_ELEMENTS = args.npoints;
   NUM_SETS = args.random_count;
-  int num_elements = NUM_ELEMENTS; 
-  
+  int num_elements = NUM_ELEMENTS;
+
   printf("Min distance: %f arcmin\n", min_arcmin);
   printf("Max distance: %f arcmin\n", max_arcmin);
   printf("Bins per dec: %i\n", bins_per_dec);
   printf("Total bins  : %i\n", NUM_BINS);
 
-  //read in files 
-  unsigned mem_size = (1+NUM_SETS)*num_elements*sizeof(struct cartesian);
-  unsigned f_mem_size = (1+NUM_SETS)*num_elements*sizeof(float);
+  // read in files
+  unsigned mem_size = (1 + NUM_SETS) * num_elements * sizeof(struct cartesian);
+  unsigned f_mem_size = (1 + NUM_SETS) * num_elements * sizeof(float);
 
   // container for all the points read from files
   struct cartesian *h_all_data;
-  h_all_data = (struct cartesian*) malloc(mem_size); 
+  h_all_data = (struct cartesian *)malloc(mem_size);
   // Until I can get libs fixed
-    
+
   // iterator for data files
   struct cartesian *working = h_all_data;
-    
+
   // go through and read all data and random points into h_all_data
-  //pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  // pb_SwitchToTimer( &timers, pb_TimerID_IO );
   readdatafile(params->inpFiles[0], working, num_elements);
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
   working += num_elements;
-  for(int i = 0; i < (NUM_SETS); i++)
-    {
-      //pb_SwitchToTimer( &timers, pb_TimerID_IO );
-      char fileName[50];
-      readdatafile(params->inpFiles[i+1], working, num_elements);
-      //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-      working += num_elements;
-    }
+  for (int i = 0; i < (NUM_SETS); i++) {
+    // pb_SwitchToTimer( &timers, pb_TimerID_IO );
+    char fileName[50];
+    readdatafile(params->inpFiles[i + 1], working, num_elements);
+    // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+    working += num_elements;
+  }
 
-  pb_InitializeTimerSet( &timers );
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   // split into x, y, and z arrays
-  float * h_x_data = (float*) malloc (3*f_mem_size);
-  float * h_y_data = h_x_data + NUM_ELEMENTS*(NUM_SETS+1);
-  float * h_z_data = h_y_data + NUM_ELEMENTS*(NUM_SETS+1);
-  for(int i = 0; i < (NUM_SETS+1); ++i)
-    {
-      for(int j = 0; j < NUM_ELEMENTS; ++j)
-	{
-	  h_x_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].x;
-	  h_y_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].y;
-	  h_z_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].z;
-	}
+  float *h_x_data = (float *)malloc(3 * f_mem_size);
+  float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1);
+  float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1);
+  for (int i = 0; i < (NUM_SETS + 1); ++i) {
+    for (int j = 0; j < NUM_ELEMENTS; ++j) {
+      h_x_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].x;
+      h_y_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].y;
+      h_z_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].z;
     }
+  }
 
   // from on use x, y, and z arrays, free h_all_data
   free(h_all_data);
@@ -154,136 +149,141 @@ main( int argc, char** argv)
   cl_int clStatus;
 
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
+
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource[] = {readFile("src/opencl_base_dynamic1d/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  const char *clSource[] = {readFile("src/opencl_base_dynamic1d/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
   CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"-I src/opencl_base_dynamic1d");
+  sprintf(clOptions, "-I src/opencl_base_dynamic1d");
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
 
-  cl_kernel clKernel = clCreateKernel(clProgram,"gen_hists",&clStatus);
+  cl_kernel clKernel = clCreateKernel(clProgram, "gen_hists", &clStatus);
   CHECK_ERROR("clCreateKernel")
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
   // allocate OpenCL memory to hold all points
-  //Sub-buffers are not defined in OpenCL 1.0
+  // Sub-buffers are not defined in OpenCL 1.0
   cl_mem d_x_data;
-  d_x_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,3*f_mem_size,NULL,&clStatus);
+  d_x_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 3 * f_mem_size, NULL,
+                            &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   // allocate OpenCL memory to hold final histograms
   // (1 for dd, and NUM_SETS for dr and rr apiece)
   cl_mem d_hists;
-  d_hists = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),NULL,&clStatus);
+  d_hists = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                           NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), NULL,
+                           &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   cl_mem dev_binb;
-  dev_binb = clCreateBuffer(clContext,CL_MEM_READ_ONLY,(NUM_BINS+1)*sizeof(float),NULL,&clStatus);
+  dev_binb = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                            (NUM_BINS + 1) * sizeof(float), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   // allocate system memory for final histograms
-  hist_t *new_hists = (hist_t *) malloc(NUM_BINS*(NUM_SETS*2+1)*
-					sizeof(hist_t));
+  hist_t *new_hists =
+      (hist_t *)malloc(NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t));
 
   // Initialize the boundary constants for bin search
   initBinB(&timers, dev_binb, clCommandQueue);
 
   // **===------------------ Kick off TPACF on OpenCL------------------===**
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_data,CL_TRUE,0,3*f_mem_size,h_x_data,0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_data, CL_TRUE, 0,
+                                  3 * f_mem_size, h_x_data, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
-  TPACF(d_hists,d_x_data,dev_binb,clCommandQueue,clKernel);
+  TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueReadBuffer(clCommandQueue,d_hists,CL_TRUE,0,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),new_hists,0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_hists, CL_TRUE, 0,
+                                 NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t),
+                                 new_hists, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // **===-------------------------------------------------------------===**
 
   // references into output histograms
   hist_t *dd_hist = new_hists;
   hist_t *rr_hist = dd_hist + NUM_BINS;
-  hist_t *dr_hist = rr_hist + NUM_BINS*NUM_SETS;
+  hist_t *dr_hist = rr_hist + NUM_BINS * NUM_SETS;
 
   // add up values within dr and rr
   int rr[NUM_BINS];
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      rr[i] = 0;
-    }
-  for(int i=0; i<NUM_SETS; i++)
-    {
-      for(int j=0; j<NUM_BINS; j++)
-	{
-	  rr[j] += rr_hist[i*NUM_BINS + j];
-	}
+  for (int i = 0; i < NUM_BINS; i++) {
+    rr[i] = 0;
+  }
+  for (int i = 0; i < NUM_SETS; i++) {
+    for (int j = 0; j < NUM_BINS; j++) {
+      rr[j] += rr_hist[i * NUM_BINS + j];
     }
+  }
   int dr[NUM_BINS];
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      dr[i] = 0;
-    }
-  for(int i=0; i<NUM_SETS; i++)
-    {
-      for(int j=0; j<NUM_BINS; j++)
-	{
-	  dr[j] += dr_hist[i*NUM_BINS + j];
-	}
+  for (int i = 0; i < NUM_BINS; i++) {
+    dr[i] = 0;
+  }
+  for (int i = 0; i < NUM_SETS; i++) {
+    for (int j = 0; j < NUM_BINS; j++) {
+      dr[j] += dr_hist[i * NUM_BINS + j];
     }
+  }
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
   FILE *outfile;
-  if ((outfile = fopen(params->outFile, "w")) == NULL)
-    {
-      fprintf(stderr, "Unable to open output file %s for writing, "
-	      "assuming stdout\n", params->outFile);
-      outfile = stdout;
-    }
-  
-  //pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  if ((outfile = fopen(params->outFile, "w")) == NULL) {
+    fprintf(stderr,
+            "Unable to open output file %s for writing, "
+            "assuming stdout\n",
+            params->outFile);
+    outfile = stdout;
+  }
+
+  // pb_SwitchToTimer( &timers, pb_TimerID_IO );
   // print out final histograms + omega (while calculating omega)
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]);
-    }
+  for (int i = 0; i < NUM_BINS; i++) {
+    fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]);
+  }
 
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  if(outfile != stdout)
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  if (outfile != stdout)
     fclose(outfile);
 
   // cleanup memory
   free(new_hists);
-  free( h_x_data);
+  free(h_x_data);
 
-  //pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  // pb_SwitchToTimer( &timers, pb_TimerID_COPY );
   clStatus = clReleaseMemObject(d_hists);
   clStatus = clReleaseMemObject(d_x_data);
   clStatus = clReleaseMemObject(dev_binb);
@@ -293,8 +293,7 @@ main( int argc, char** argv)
   clStatus = clReleaseContext(clContext);
   CHECK_ERROR("clReleaseContext")
 
-  free((void*)clSource[0]);
+  free((void *)clSource[0]);
 
   pb_FreeParameters(params);
 }
-
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.cc
index 97e9e9eb5518a56eff4cc7c9da7d5ce6d9b69e0b..9e7139ac6f43104a9b7b85c1f6d538257d827ab2 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.cc
@@ -6,83 +6,75 @@
  *cr
  ***************************************************************************/
 #include <CL/cl.h>
-#include <sys/time.h>
-#include <stdio.h>
-#include <math.h>
-#include <strings.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <strings.h>
+#include <sys/time.h>
 
 #include "model.h"
 
 unsigned int NUM_SETS;
 unsigned int NUM_ELEMENTS;
 
-int readdatafile(char *fname, struct cartesian *data, int npoints)
-{
+int readdatafile(char *fname, struct cartesian *data, int npoints) {
   FILE *infile;
   int lcount = 0;
   float ra, dec;
 
-  if ((infile = fopen(fname, "r")) == NULL)
-    {
-      fprintf(stderr, "Unable to open data file %s for reading\n", fname);
-      return lcount;
-    }
+  if ((infile = fopen(fname, "r")) == NULL) {
+    fprintf(stderr, "Unable to open data file %s for reading\n", fname);
+    return lcount;
+  }
+
+  for (lcount = 0; lcount < npoints; lcount++) {
+    if (fscanf(infile, "%f %f", &ra, &dec) != 2)
+      break;
 
-  for (lcount = 0; lcount < npoints; lcount++)
     {
-      if (fscanf(infile, "%f %f", &ra, &dec) != 2)
-	break;
+      // data conversion
+      float rarad = D2R * ra;
+      float decrad = D2R * dec;
+      float cd = cos(decrad);
 
-      {
-        // data conversion
-        float rarad = D2R * ra;
-        float decrad = D2R * dec;
-        float cd = cos(decrad);
-	
-	data[lcount].x = cos(rarad) * cd;
-	data[lcount].y = sin(rarad) * cd;
-	data[lcount].z = sin(decrad);
-      }
+      data[lcount].x = cos(rarad) * cd;
+      data[lcount].y = sin(rarad) * cd;
+      data[lcount].z = sin(decrad);
     }
+  }
 
   fclose(infile);
-  
+
   return lcount;
 }
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error: Cannot open kernel file for reading!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error: Cannot open kernel file for reading!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error: Cannot allocated buffer for file contents!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error: Cannot allocated buffer for file contents!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error: Cannot read kernel file contents!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error: Cannot read kernel file contents!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.h
index 1a8c149aac15b39ed9ecaaecc8318582babb33f6..f9df468e542d4104fb52e9e6782c7b8a1736648d 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_base_dynamic1d/model.h
@@ -8,9 +8,9 @@
 #ifndef __MODEL_H__
 #define __MODEL_H__
 
-#define D2R M_PI/180.0
-#define R2D 180.0/M_PI
-#define R2AM 60.0*180.0/M_PI
+#define D2R M_PI / 180.0
+#define R2D 180.0 / M_PI
+#define R2AM 60.0 * 180.0 / M_PI
 
 #define bins_per_dec 5
 #define min_arcmin 1.0
@@ -21,26 +21,23 @@
 
 typedef unsigned long hist_t;
 
-struct spherical 
-{
-  float ra, dec;  // latitude, longitude pair
+struct spherical {
+  float ra, dec; // longitude-like (ra), latitude-like (dec) pair
 };
- 
-struct cartesian 
-{
-  float x, y, z;  // cartesian coodrinates
+
+struct cartesian {
+  float x, y, z; // cartesian coordinates
 };
 
 int readdatafile(char *fname, struct cartesian *data, int npoints);
 
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.cc
index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.cc
@@ -5,22 +5,21 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include "args.h"
+#include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-#include <stdio.h>
-#include "args.h"
 
 extern char *optarg;
 
-void usage(char *name)
-{
+void usage(char *name) {
   printf("Usage: %s <-d data_file_name> <-r rnd_file_name> "
-	 "<-m rnd_count> <-p count> <-o file_name>\n", name);
+         "<-m rnd_count> <-p count> <-o file_name>\n",
+         name);
   exit(0);
 }
 
-void parse_args(int argc, char **argv, options* args)
-{
+void parse_args(int argc, char **argv, options *args) {
   int c;
 
   args->data_name = NULL;
@@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args)
   args->random_count = 0;
   args->npoints = 0;
   args->output_name = NULL;
-  
-  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF)
-    {
-      switch (c)
-	{
-        case 'd':
-          args->data_name = optarg;
-          break;
-        case 'r':
-          args->random_name = optarg;
-          break;
-        case 'n':
-          args->random_count = atoi(optarg);
-          break;
-        case 'o':
-          args->output_name = optarg;
-          break;
-        case 'p':
-          args->npoints = atol(optarg);
-          break;
-        default:
-          usage(argv[0]);
-	}
+
+  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) {
+    switch (c) {
+    case 'd':
+      args->data_name = optarg;
+      break;
+    case 'r':
+      args->random_name = optarg;
+      break;
+    case 'n':
+      args->random_count = atoi(optarg);
+      break;
+    case 'o':
+      args->output_name = optarg;
+      break;
+    case 'p':
+      args->npoints = atol(optarg);
+      break;
+    default:
+      usage(argv[0]);
     }
+  }
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.h
index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/args.h
@@ -8,8 +8,7 @@
 #ifndef __ARGS_H__
 #define __ARGS_H__
 
-typedef struct _options_
-{
+typedef struct _options_ {
   char *data_name;
   char *random_name;
   int random_count;
@@ -18,6 +17,6 @@ typedef struct _options_
 } options;
 
 void usage(char *name);
-void parse_args(int argc, char **argv, options* args);
+void parse_args(int argc, char **argv, options *args);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc
index cf57cd447fb6612dc8eb7c849aa46fe34cedc40b..791b5fbdd6aa70359d37ca5a85139c7f8374c56d 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/main.cc
@@ -6,11 +6,11 @@
  *cr
  ***************************************************************************/
 #include <CL/cl.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #include "args.h"
 #include "model.h"
@@ -19,114 +19,109 @@ extern unsigned int NUM_SETS;
 extern unsigned int NUM_ELEMENTS;
 
 // create the bin boundaries
-void initBinB( struct pb_TimerSet *timers, cl_mem dev_binb, cl_command_queue clCommandQueue)
-{
-  float *binb = (float*)malloc((NUM_BINS+1)*sizeof(float));
-  for (int k = 0; k < NUM_BINS+1; k++)
-    {
-      binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec)) 
-		    / 60.0*D2R);
-    }
+void initBinB(struct pb_TimerSet *timers, cl_mem dev_binb,
+              cl_command_queue clCommandQueue) {
+  float *binb = (float *)malloc((NUM_BINS + 1) * sizeof(float));
+  for (int k = 0; k < NUM_BINS + 1; k++) {
+    binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) /
+                  60.0 * D2R);
+  }
 
-  pb_SwitchToTimer( timers, pb_TimerID_COPY );
+  pb_SwitchToTimer(timers, pb_TimerID_COPY);
 
   cl_int clStatus;
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dev_binb,CL_TRUE,0,(NUM_BINS+1)*sizeof(float),binb,0,NULL,NULL);
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, dev_binb, CL_TRUE, 0,
+                           (NUM_BINS + 1) * sizeof(float), binb, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
   free(binb);
 }
 
-void TPACF(cl_mem histograms, cl_mem d_x_data, 
-	   cl_mem dev_binb, 
-	   cl_command_queue clCommandQueue, cl_kernel clKernel)
-{
+void TPACF(cl_mem histograms, cl_mem d_x_data, cl_mem dev_binb,
+           cl_command_queue clCommandQueue, cl_kernel clKernel) {
   size_t dimBlock = BLOCK_SIZE;
-  size_t dimGrid = (NUM_SETS*2 + 1)*dimBlock;
-  
+  size_t dimGrid = (NUM_SETS * 2 + 1) * dimBlock;
+
   cl_int clStatus;
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&histograms);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_x_data);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&dev_binb);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),&NUM_SETS);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(int),&NUM_ELEMENTS);
-  
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &histograms);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_x_data);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &dev_binb);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &NUM_SETS);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), &NUM_ELEMENTS);
+
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dimGrid,&dimBlock,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &dimGrid,
+                                    &dimBlock, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
 
   clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int 
-main( int argc, char** argv) 
-{
+int main(int argc, char **argv) {
   struct pb_TimerSet timers;
   struct pb_Parameters *params;
 
-  params = pb_ReadParameters( &argc, argv );
+  params = pb_ReadParameters(&argc, argv);
 
   options args;
   parse_args(argc, argv, &args);
-  
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
   NUM_ELEMENTS = args.npoints;
   NUM_SETS = args.random_count;
-  int num_elements = NUM_ELEMENTS; 
-  
+  int num_elements = NUM_ELEMENTS;
+
   printf("Min distance: %f arcmin\n", min_arcmin);
   printf("Max distance: %f arcmin\n", max_arcmin);
   printf("Bins per dec: %i\n", bins_per_dec);
   printf("Total bins  : %i\n", NUM_BINS);
 
-  //read in files 
-  unsigned mem_size = (1+NUM_SETS)*num_elements*sizeof(struct cartesian);
-  unsigned f_mem_size = (1+NUM_SETS)*num_elements*sizeof(float);
+  // read in files
+  unsigned mem_size = (1 + NUM_SETS) * num_elements * sizeof(struct cartesian);
+  unsigned f_mem_size = (1 + NUM_SETS) * num_elements * sizeof(float);
 
   // container for all the points read from files
   struct cartesian *h_all_data;
-  h_all_data = (struct cartesian*) malloc(mem_size); 
+  h_all_data = (struct cartesian *)malloc(mem_size);
   // Until I can get libs fixed
-    
+
   // iterator for data files
   struct cartesian *working = h_all_data;
-    
+
   // go through and read all data and random points into h_all_data
-  //pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  // pb_SwitchToTimer( &timers, pb_TimerID_IO );
   readdatafile(params->inpFiles[0], working, num_elements);
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
   working += num_elements;
-  for(int i = 0; i < (NUM_SETS); i++)
-    {
-      //pb_SwitchToTimer( &timers, pb_TimerID_IO );
-      char fileName[50];
-      readdatafile(params->inpFiles[i+1], working, num_elements);
-      //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-      working += num_elements;
-    }
+  for (int i = 0; i < (NUM_SETS); i++) {
+    // pb_SwitchToTimer( &timers, pb_TimerID_IO );
+    char fileName[50];
+    readdatafile(params->inpFiles[i + 1], working, num_elements);
+    // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+    working += num_elements;
+  }
 
-  pb_InitializeTimerSet( &timers );
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   // split into x, y, and z arrays
-  float * h_x_data = (float*) malloc (3*f_mem_size);
-  float * h_y_data = h_x_data + NUM_ELEMENTS*(NUM_SETS+1);
-  float * h_z_data = h_y_data + NUM_ELEMENTS*(NUM_SETS+1);
-  for(int i = 0; i < (NUM_SETS+1); ++i)
-    {
-      for(int j = 0; j < NUM_ELEMENTS; ++j)
-	{
-	  h_x_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].x;
-	  h_y_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].y;
-	  h_z_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].z;
-	}
+  float *h_x_data = (float *)malloc(3 * f_mem_size);
+  float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1);
+  float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1);
+  for (int i = 0; i < (NUM_SETS + 1); ++i) {
+    for (int j = 0; j < NUM_ELEMENTS; ++j) {
+      h_x_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].x;
+      h_y_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].y;
+      h_z_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].z;
     }
+  }
 
   // from on use x, y, and z arrays, free h_all_data
   free(h_all_data);
@@ -134,139 +129,145 @@ main( int argc, char** argv)
   cl_int clStatus;
 
   cl_uint numPlatforms;
-  clStatus  = clGetPlatformIDs(0, NULL, &numPlatforms);
+  clStatus = clGetPlatformIDs(0, NULL, &numPlatforms);
 
   cl_platform_id clPlatform[numPlatforms];
   clStatus = clGetPlatformIDs(numPlatforms, clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform[1],CL_DEVICE_TYPE_CPU,1,&clDevice,NULL);
+  clStatus =
+      clGetDeviceIDs(clPlatform[1], CL_DEVICE_TYPE_CPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform[1],0};
-  cl_context clContext = clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform[1], 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_CPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource[] = {readFile("src/opencl_cpu_base/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  const char *clSource[] = {readFile("src/opencl_cpu_base/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
   CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"-I src/opencl_base");
+  sprintf(clOptions, "-I src/opencl_base");
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
 
-  cl_kernel clKernel = clCreateKernel(clProgram,"gen_hists",&clStatus);
+  cl_kernel clKernel = clCreateKernel(clProgram, "gen_hists", &clStatus);
   CHECK_ERROR("clCreateKernel")
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
   // allocate OpenCL memory to hold all points
-  //Sub-buffers are not defined in OpenCL 1.0
+  // Sub-buffers are not defined in OpenCL 1.0
   cl_mem d_x_data;
-  d_x_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,3*f_mem_size,NULL,&clStatus);
+  d_x_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 3 * f_mem_size, NULL,
+                            &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   // allocate OpenCL memory to hold final histograms
   // (1 for dd, and NUM_SETS for dr and rr apiece)
   cl_mem d_hists;
-  d_hists = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),NULL,&clStatus);
+  d_hists = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                           NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), NULL,
+                           &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   cl_mem dev_binb;
-  dev_binb = clCreateBuffer(clContext,CL_MEM_READ_ONLY,(NUM_BINS+1)*sizeof(float),NULL,&clStatus);
+  dev_binb = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                            (NUM_BINS + 1) * sizeof(float), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   // allocate system memory for final histograms
-  hist_t *new_hists = (hist_t *) malloc(NUM_BINS*(NUM_SETS*2+1)*
-					sizeof(hist_t));
+  hist_t *new_hists =
+      (hist_t *)malloc(NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t));
 
   // Initialize the boundary constants for bin search
   initBinB(&timers, dev_binb, clCommandQueue);
 
   // **===------------------ Kick off TPACF on OpenCL------------------===**
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_data,CL_TRUE,0,3*f_mem_size,h_x_data,0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_data, CL_TRUE, 0,
+                                  3 * f_mem_size, h_x_data, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
 
-  TPACF(d_hists,d_x_data,dev_binb,clCommandQueue,clKernel);
+  TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueReadBuffer(clCommandQueue,d_hists,CL_TRUE,0,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),new_hists,0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_hists, CL_TRUE, 0,
+                                 NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t),
+                                 new_hists, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // **===-------------------------------------------------------------===**
 
   // references into output histograms
   hist_t *dd_hist = new_hists;
   hist_t *rr_hist = dd_hist + NUM_BINS;
-  hist_t *dr_hist = rr_hist + NUM_BINS*NUM_SETS;
+  hist_t *dr_hist = rr_hist + NUM_BINS * NUM_SETS;
 
   // add up values within dr and rr
   int rr[NUM_BINS];
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      rr[i] = 0;
-    }
-  for(int i=0; i<NUM_SETS; i++)
-    {
-      for(int j=0; j<NUM_BINS; j++)
-	{
-	  rr[j] += rr_hist[i*NUM_BINS + j];
-	}
+  for (int i = 0; i < NUM_BINS; i++) {
+    rr[i] = 0;
+  }
+  for (int i = 0; i < NUM_SETS; i++) {
+    for (int j = 0; j < NUM_BINS; j++) {
+      rr[j] += rr_hist[i * NUM_BINS + j];
     }
+  }
   int dr[NUM_BINS];
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      dr[i] = 0;
-    }
-  for(int i=0; i<NUM_SETS; i++)
-    {
-      for(int j=0; j<NUM_BINS; j++)
-	{
-	  dr[j] += dr_hist[i*NUM_BINS + j];
-	}
+  for (int i = 0; i < NUM_BINS; i++) {
+    dr[i] = 0;
+  }
+  for (int i = 0; i < NUM_SETS; i++) {
+    for (int j = 0; j < NUM_BINS; j++) {
+      dr[j] += dr_hist[i * NUM_BINS + j];
     }
+  }
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
   FILE *outfile;
-  if ((outfile = fopen(params->outFile, "w")) == NULL)
-    {
-      fprintf(stderr, "Unable to open output file %s for writing, "
-	      "assuming stdout\n", params->outFile);
-      outfile = stdout;
-    }
-  
-  //pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  if ((outfile = fopen(params->outFile, "w")) == NULL) {
+    fprintf(stderr,
+            "Unable to open output file %s for writing, "
+            "assuming stdout\n",
+            params->outFile);
+    outfile = stdout;
+  }
+
+  // pb_SwitchToTimer( &timers, pb_TimerID_IO );
   // print out final histograms + omega (while calculating omega)
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]);
-    }
+  for (int i = 0; i < NUM_BINS; i++) {
+    fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]);
+  }
 
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  if(outfile != stdout)
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  if (outfile != stdout)
     fclose(outfile);
 
   // cleanup memory
   free(new_hists);
-  free( h_x_data);
+  free(h_x_data);
 
-  //pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  // pb_SwitchToTimer( &timers, pb_TimerID_COPY );
   clStatus = clReleaseMemObject(d_hists);
   clStatus = clReleaseMemObject(d_x_data);
   clStatus = clReleaseMemObject(dev_binb);
@@ -276,8 +277,7 @@ main( int argc, char** argv)
   clStatus = clReleaseContext(clContext);
   CHECK_ERROR("clReleaseContext")
 
-  free((void*)clSource[0]);
+  free((void *)clSource[0]);
 
   pb_FreeParameters(params);
 }
-
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.cc
index 97e9e9eb5518a56eff4cc7c9da7d5ce6d9b69e0b..9e7139ac6f43104a9b7b85c1f6d538257d827ab2 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.cc
@@ -6,83 +6,75 @@
  *cr
  ***************************************************************************/
 #include <CL/cl.h>
-#include <sys/time.h>
-#include <stdio.h>
-#include <math.h>
-#include <strings.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <strings.h>
+#include <sys/time.h>
 
 #include "model.h"
 
 unsigned int NUM_SETS;
 unsigned int NUM_ELEMENTS;
 
-int readdatafile(char *fname, struct cartesian *data, int npoints)
-{
+int readdatafile(char *fname, struct cartesian *data, int npoints) {
   FILE *infile;
   int lcount = 0;
   float ra, dec;
 
-  if ((infile = fopen(fname, "r")) == NULL)
-    {
-      fprintf(stderr, "Unable to open data file %s for reading\n", fname);
-      return lcount;
-    }
+  if ((infile = fopen(fname, "r")) == NULL) {
+    fprintf(stderr, "Unable to open data file %s for reading\n", fname);
+    return lcount;
+  }
+
+  for (lcount = 0; lcount < npoints; lcount++) {
+    if (fscanf(infile, "%f %f", &ra, &dec) != 2)
+      break;
 
-  for (lcount = 0; lcount < npoints; lcount++)
     {
-      if (fscanf(infile, "%f %f", &ra, &dec) != 2)
-	break;
+      // data conversion
+      float rarad = D2R * ra;
+      float decrad = D2R * dec;
+      float cd = cos(decrad);
 
-      {
-        // data conversion
-        float rarad = D2R * ra;
-        float decrad = D2R * dec;
-        float cd = cos(decrad);
-	
-	data[lcount].x = cos(rarad) * cd;
-	data[lcount].y = sin(rarad) * cd;
-	data[lcount].z = sin(decrad);
-      }
+      data[lcount].x = cos(rarad) * cd;
+      data[lcount].y = sin(rarad) * cd;
+      data[lcount].z = sin(decrad);
     }
+  }
 
   fclose(infile);
-  
+
   return lcount;
 }
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error: Cannot open kernel file for reading!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error: Cannot open kernel file for reading!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error: Cannot allocated buffer for file contents!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error: Cannot allocated buffer for file contents!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error: Cannot read kernel file contents!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error: Cannot read kernel file contents!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.h
index 1a8c149aac15b39ed9ecaaecc8318582babb33f6..f9df468e542d4104fb52e9e6782c7b8a1736648d 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_cpu_base/model.h
@@ -8,9 +8,9 @@
 #ifndef __MODEL_H__
 #define __MODEL_H__
 
-#define D2R M_PI/180.0
-#define R2D 180.0/M_PI
-#define R2AM 60.0*180.0/M_PI
+#define D2R M_PI / 180.0
+#define R2D 180.0 / M_PI
+#define R2AM 60.0 * 180.0 / M_PI
 
 #define bins_per_dec 5
 #define min_arcmin 1.0
@@ -21,26 +21,23 @@
 
 typedef unsigned long hist_t;
 
-struct spherical 
-{
-  float ra, dec;  // latitude, longitude pair
+struct spherical {
+  float ra, dec; // latitude, longitude pair
 };
- 
-struct cartesian 
-{
-  float x, y, z;  // cartesian coodrinates
+
+struct cartesian {
+  float x, y, z; // cartesian coodrinates
 };
 
 int readdatafile(char *fname, struct cartesian *data, int npoints);
 
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.cc
index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.cc
@@ -5,22 +5,21 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include "args.h"
+#include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-#include <stdio.h>
-#include "args.h"
 
 extern char *optarg;
 
-void usage(char *name)
-{
+void usage(char *name) {
   printf("Usage: %s <-d data_file_name> <-r rnd_file_name> "
-	 "<-m rnd_count> <-p count> <-o file_name>\n", name);
+         "<-m rnd_count> <-p count> <-o file_name>\n",
+         name);
   exit(0);
 }
 
-void parse_args(int argc, char **argv, options* args)
-{
+void parse_args(int argc, char **argv, options *args) {
   int c;
 
   args->data_name = NULL;
@@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args)
   args->random_count = 0;
   args->npoints = 0;
   args->output_name = NULL;
-  
-  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF)
-    {
-      switch (c)
-	{
-        case 'd':
-          args->data_name = optarg;
-          break;
-        case 'r':
-          args->random_name = optarg;
-          break;
-        case 'n':
-          args->random_count = atoi(optarg);
-          break;
-        case 'o':
-          args->output_name = optarg;
-          break;
-        case 'p':
-          args->npoints = atol(optarg);
-          break;
-        default:
-          usage(argv[0]);
-	}
+
+  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) {
+    switch (c) {
+    case 'd':
+      args->data_name = optarg;
+      break;
+    case 'r':
+      args->random_name = optarg;
+      break;
+    case 'n':
+      args->random_count = atoi(optarg);
+      break;
+    case 'o':
+      args->output_name = optarg;
+      break;
+    case 'p':
+      args->npoints = atol(optarg);
+      break;
+    default:
+      usage(argv[0]);
     }
+  }
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.h
index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/args.h
@@ -8,8 +8,7 @@
 #ifndef __ARGS_H__
 #define __ARGS_H__
 
-typedef struct _options_
-{
+typedef struct _options_ {
   char *data_name;
   char *random_name;
   int random_count;
@@ -18,6 +17,6 @@ typedef struct _options_
 } options;
 
 void usage(char *name);
-void parse_args(int argc, char **argv, options* args);
+void parse_args(int argc, char **argv, options *args);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/main.cc
index e6f4e6a0f92991e87a640d9bac61a419eccf8569..773fc20258b3301ee507e9957efae089a074d047 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/main.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/main.cc
@@ -6,11 +6,11 @@
  *cr
  ***************************************************************************/
 #include <CL/cl.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #include "args.h"
 #include "model.h"
@@ -19,114 +19,109 @@ extern unsigned int NUM_SETS;
 extern unsigned int NUM_ELEMENTS;
 
 // create the bin boundaries
-void initBinB( struct pb_TimerSet *timers, cl_mem dev_binb, cl_command_queue clCommandQueue)
-{
-  float *binb = (float*)malloc((NUM_BINS+1)*sizeof(float));
-  for (int k = 0; k < NUM_BINS+1; k++)
-    {
-      binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec)) 
-		    / 60.0*D2R);
-    }
+void initBinB(struct pb_TimerSet *timers, cl_mem dev_binb,
+              cl_command_queue clCommandQueue) {
+  float *binb = (float *)malloc((NUM_BINS + 1) * sizeof(float));
+  for (int k = 0; k < NUM_BINS + 1; k++) {
+    binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) /
+                  60.0 * D2R);
+  }
 
-  pb_SwitchToTimer( timers, pb_TimerID_COPY );
+  pb_SwitchToTimer(timers, pb_TimerID_COPY);
 
   cl_int clStatus;
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,dev_binb,CL_TRUE,0,(NUM_BINS+1)*sizeof(float),binb,0,NULL,NULL);
+  clStatus =
+      clEnqueueWriteBuffer(clCommandQueue, dev_binb, CL_TRUE, 0,
+                           (NUM_BINS + 1) * sizeof(float), binb, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
   free(binb);
 }
 
-void TPACF(cl_mem histograms, cl_mem d_x_data, 
-	   cl_mem dev_binb, 
-	   cl_command_queue clCommandQueue, cl_kernel clKernel)
-{
+void TPACF(cl_mem histograms, cl_mem d_x_data, cl_mem dev_binb,
+           cl_command_queue clCommandQueue, cl_kernel clKernel) {
   size_t dimBlock = BLOCK_SIZE;
-  size_t dimGrid = (NUM_SETS*2 + 1)*dimBlock;
-  
+  size_t dimGrid = (NUM_SETS * 2 + 1) * dimBlock;
+
   cl_int clStatus;
-  clStatus = clSetKernelArg(clKernel,0,sizeof(cl_mem),&histograms);
-  clStatus = clSetKernelArg(clKernel,1,sizeof(cl_mem),&d_x_data);
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&dev_binb);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),&NUM_SETS);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(int),&NUM_ELEMENTS);
-  
+  clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &histograms);
+  clStatus = clSetKernelArg(clKernel, 1, sizeof(cl_mem), &d_x_data);
+  clStatus = clSetKernelArg(clKernel, 2, sizeof(cl_mem), &dev_binb);
+  clStatus = clSetKernelArg(clKernel, 3, sizeof(int), &NUM_SETS);
+  clStatus = clSetKernelArg(clKernel, 4, sizeof(int), &NUM_ELEMENTS);
+
   CHECK_ERROR("clSetKernelArg")
 
-  clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,1,NULL,&dimGrid,&dimBlock,0,NULL,NULL);
+  clStatus = clEnqueueNDRangeKernel(clCommandQueue, clKernel, 1, NULL, &dimGrid,
+                                    &dimBlock, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueNDRangeKernel")
 
   clStatus = clFinish(clCommandQueue);
   CHECK_ERROR("clFinish")
 }
 
-int 
-main( int argc, char** argv) 
-{
+int main(int argc, char **argv) {
   struct pb_TimerSet timers;
   struct pb_Parameters *params;
 
-  params = pb_ReadParameters( &argc, argv );
+  params = pb_ReadParameters(&argc, argv);
 
   options args;
   parse_args(argc, argv, &args);
-  
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
   NUM_ELEMENTS = args.npoints;
   NUM_SETS = args.random_count;
-  int num_elements = NUM_ELEMENTS; 
-  
+  int num_elements = NUM_ELEMENTS;
+
   printf("Min distance: %f arcmin\n", min_arcmin);
   printf("Max distance: %f arcmin\n", max_arcmin);
   printf("Bins per dec: %i\n", bins_per_dec);
   printf("Total bins  : %i\n", NUM_BINS);
 
-  //read in files 
-  unsigned mem_size = (1+NUM_SETS)*num_elements*sizeof(struct cartesian);
-  unsigned f_mem_size = (1+NUM_SETS)*num_elements*sizeof(float);
+  // read in files
+  unsigned mem_size = (1 + NUM_SETS) * num_elements * sizeof(struct cartesian);
+  unsigned f_mem_size = (1 + NUM_SETS) * num_elements * sizeof(float);
 
   // container for all the points read from files
   struct cartesian *h_all_data;
-  h_all_data = (struct cartesian*) malloc(mem_size); 
+  h_all_data = (struct cartesian *)malloc(mem_size);
   // Until I can get libs fixed
-    
+
   // iterator for data files
   struct cartesian *working = h_all_data;
-    
+
   // go through and read all data and random points into h_all_data
-  //pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  // pb_SwitchToTimer( &timers, pb_TimerID_IO );
   readdatafile(params->inpFiles[0], working, num_elements);
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
   working += num_elements;
-  for(int i = 0; i < (NUM_SETS); i++)
-    {
-      //pb_SwitchToTimer( &timers, pb_TimerID_IO );
-      char fileName[50];
-      readdatafile(params->inpFiles[i+1], working, num_elements);
-      //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-
-      working += num_elements;
-    }
+  for (int i = 0; i < (NUM_SETS); i++) {
+    // pb_SwitchToTimer( &timers, pb_TimerID_IO );
+    char fileName[50];
+    readdatafile(params->inpFiles[i + 1], working, num_elements);
+    // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+    working += num_elements;
+  }
 
-  pb_InitializeTimerSet( &timers );
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   // split into x, y, and z arrays
-  float * h_x_data = (float*) malloc (3*f_mem_size);
-  float * h_y_data = h_x_data + NUM_ELEMENTS*(NUM_SETS+1);
-  float * h_z_data = h_y_data + NUM_ELEMENTS*(NUM_SETS+1);
-  for(int i = 0; i < (NUM_SETS+1); ++i)
-    {
-      for(int j = 0; j < NUM_ELEMENTS; ++j)
-	{
-	  h_x_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].x;
-	  h_y_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].y;
-	  h_z_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].z;
-	}
+  float *h_x_data = (float *)malloc(3 * f_mem_size);
+  float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1);
+  float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1);
+  for (int i = 0; i < (NUM_SETS + 1); ++i) {
+    for (int j = 0; j < NUM_ELEMENTS; ++j) {
+      h_x_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].x;
+      h_y_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].y;
+      h_z_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].z;
     }
+  }
 
   // from on use x, y, and z arrays, free h_all_data
   free(h_all_data);
@@ -134,150 +129,157 @@ main( int argc, char** argv)
   cl_int clStatus;
 
   cl_platform_id clPlatform;
-  clStatus = clGetPlatformIDs(1,&clPlatform,NULL);
+  clStatus = clGetPlatformIDs(1, &clPlatform, NULL);
   CHECK_ERROR("clGetPlatformIDs")
 
-  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,(cl_context_properties)clPlatform,0};
-  cl_context clContext = clCreateContextFromType(clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+  cl_context_properties clCps[3] = {CL_CONTEXT_PLATFORM,
+                                    (cl_context_properties)clPlatform, 0};
+  cl_context clContext =
+      clCreateContextFromType(clCps, CL_DEVICE_TYPE_GPU, NULL, NULL, &clStatus);
   CHECK_ERROR("clCreateContextFromType")
-   
+
   cl_device_id clDevice;
-  clStatus = clGetDeviceIDs(clPlatform,CL_DEVICE_TYPE_GPU,1,&clDevice,NULL);
+  clStatus = clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 1, &clDevice, NULL);
   CHECK_ERROR("clGetDeviceIDs")
 
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  cl_command_queue clCommandQueue = clCreateCommandQueue(
+      clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
   CHECK_ERROR("clCreateCommandQueue")
 
   pb_SetOpenCL(&clContext, &clCommandQueue);
 
-  const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-  cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  const char *clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+  cl_program clProgram =
+      clCreateProgramWithSource(clContext, 1, clSource, NULL, &clStatus);
   CHECK_ERROR("clCreateProgramWithSource")
 
   char clOptions[50];
-  sprintf(clOptions,"-I src/opencl_nvidia");
+  sprintf(clOptions, "-I src/opencl_nvidia");
 
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  clStatus = clBuildProgram(clProgram, 1, &clDevice, clOptions, NULL, NULL);
   CHECK_ERROR("clBuildProgram")
 
-  cl_kernel clKernel = clCreateKernel(clProgram,"gen_hists",&clStatus);
+  cl_kernel clKernel = clCreateKernel(clProgram, "gen_hists", &clStatus);
   CHECK_ERROR("clCreateKernel")
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
- 
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+
   // Get program binary
   // Query binary (PTX file) size
-    size_t bin_sz;
-    clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bin_sz, NULL);
- 
-    // Read binary (PTX file) to memory buffer
-    unsigned char *bin = (unsigned char *)malloc(bin_sz);
-    clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL);
- 
-    // Save PTX to add_vectors_ocl.ptx
-    FILE* fp = fopen("tpacf.nvptx.s", "wb");
-    fwrite(bin, sizeof(char), bin_sz, fp);
-    fclose(fp);
-    free(bin); 
+  size_t bin_sz;
+  clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARY_SIZES,
+                              sizeof(size_t), &bin_sz, NULL);
+
+  // Read binary (PTX file) to memory buffer
+  unsigned char *bin = (unsigned char *)malloc(bin_sz);
+  clStatus = clGetProgramInfo(clProgram, CL_PROGRAM_BINARIES,
+                              sizeof(unsigned char *), &bin, NULL);
+
+  // Save PTX to add_vectors_ocl.ptx
+  FILE *fp = fopen("tpacf.nvptx.s", "wb");
+  fwrite(bin, sizeof(char), bin_sz, fp);
+  fclose(fp);
+  free(bin);
   // allocate OpenCL memory to hold all points
-  //Sub-buffers are not defined in OpenCL 1.0
+  // Sub-buffers are not defined in OpenCL 1.0
   cl_mem d_x_data;
-  d_x_data = clCreateBuffer(clContext,CL_MEM_READ_ONLY,3*f_mem_size,NULL,&clStatus);
+  d_x_data = clCreateBuffer(clContext, CL_MEM_READ_ONLY, 3 * f_mem_size, NULL,
+                            &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   // allocate OpenCL memory to hold final histograms
   // (1 for dd, and NUM_SETS for dr and rr apiece)
   cl_mem d_hists;
-  d_hists = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),NULL,&clStatus);
+  d_hists = clCreateBuffer(clContext, CL_MEM_WRITE_ONLY,
+                           NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t), NULL,
+                           &clStatus);
   CHECK_ERROR("clCreateBuffer")
 
   cl_mem dev_binb;
-  dev_binb = clCreateBuffer(clContext,CL_MEM_READ_ONLY,(NUM_BINS+1)*sizeof(float),NULL,&clStatus);
+  dev_binb = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
+                            (NUM_BINS + 1) * sizeof(float), NULL, &clStatus);
   CHECK_ERROR("clCreateBuffer")
-  
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
 
   // allocate system memory for final histograms
-  hist_t *new_hists = (hist_t *) malloc(NUM_BINS*(NUM_SETS*2+1)*
-					sizeof(hist_t));
+  hist_t *new_hists =
+      (hist_t *)malloc(NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t));
 
   // Initialize the boundary constants for bin search
   initBinB(&timers, dev_binb, clCommandQueue);
 
   // **===------------------ Kick off TPACF on OpenCL------------------===**
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,d_x_data,CL_TRUE,0,3*f_mem_size,h_x_data,0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueWriteBuffer(clCommandQueue, d_x_data, CL_TRUE, 0,
+                                  3 * f_mem_size, h_x_data, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueWriteBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_KERNEL );
+  pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
 
-  TPACF(d_hists,d_x_data,dev_binb,clCommandQueue,clKernel);
+  TPACF(d_hists, d_x_data, dev_binb, clCommandQueue, clKernel);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
-  clStatus = clEnqueueReadBuffer(clCommandQueue,d_hists,CL_TRUE,0,NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t),new_hists,0,NULL,NULL);
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue, d_hists, CL_TRUE, 0,
+                                 NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t),
+                                 new_hists, 0, NULL, NULL);
   CHECK_ERROR("clEnqueueReadBuffer")
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // **===-------------------------------------------------------------===**
 
   // references into output histograms
   hist_t *dd_hist = new_hists;
   hist_t *rr_hist = dd_hist + NUM_BINS;
-  hist_t *dr_hist = rr_hist + NUM_BINS*NUM_SETS;
+  hist_t *dr_hist = rr_hist + NUM_BINS * NUM_SETS;
 
   // add up values within dr and rr
   int rr[NUM_BINS];
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      rr[i] = 0;
-    }
-  for(int i=0; i<NUM_SETS; i++)
-    {
-      for(int j=0; j<NUM_BINS; j++)
-	{
-	  rr[j] += rr_hist[i*NUM_BINS + j];
-	}
+  for (int i = 0; i < NUM_BINS; i++) {
+    rr[i] = 0;
+  }
+  for (int i = 0; i < NUM_SETS; i++) {
+    for (int j = 0; j < NUM_BINS; j++) {
+      rr[j] += rr_hist[i * NUM_BINS + j];
     }
+  }
   int dr[NUM_BINS];
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      dr[i] = 0;
-    }
-  for(int i=0; i<NUM_SETS; i++)
-    {
-      for(int j=0; j<NUM_BINS; j++)
-	{
-	  dr[j] += dr_hist[i*NUM_BINS + j];
-	}
+  for (int i = 0; i < NUM_BINS; i++) {
+    dr[i] = 0;
+  }
+  for (int i = 0; i < NUM_SETS; i++) {
+    for (int j = 0; j < NUM_BINS; j++) {
+      dr[j] += dr_hist[i * NUM_BINS + j];
     }
+  }
 
   pb_SwitchToTimer(&timers, pb_TimerID_NONE);
   pb_PrintTimerSet(&timers);
 
   FILE *outfile;
-  if ((outfile = fopen(params->outFile, "w")) == NULL)
-    {
-      fprintf(stderr, "Unable to open output file %s for writing, "
-	      "assuming stdout\n", params->outFile);
-      outfile = stdout;
-    }
-  
-  //pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  if ((outfile = fopen(params->outFile, "w")) == NULL) {
+    fprintf(stderr,
+            "Unable to open output file %s for writing, "
+            "assuming stdout\n",
+            params->outFile);
+    outfile = stdout;
+  }
+
+  // pb_SwitchToTimer( &timers, pb_TimerID_IO );
   // print out final histograms + omega (while calculating omega)
-  for(int i=0; i<NUM_BINS; i++)
-    {
-      fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]);
-    }
+  for (int i = 0; i < NUM_BINS; i++) {
+    fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]);
+  }
 
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  if(outfile != stdout)
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  if (outfile != stdout)
     fclose(outfile);
 
   // cleanup memory
   free(new_hists);
-  free( h_x_data);
+  free(h_x_data);
 
-  //pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  // pb_SwitchToTimer( &timers, pb_TimerID_COPY );
   clStatus = clReleaseMemObject(d_hists);
   clStatus = clReleaseMemObject(d_x_data);
   clStatus = clReleaseMemObject(dev_binb);
@@ -287,8 +289,7 @@ main( int argc, char** argv)
   clStatus = clReleaseContext(clContext);
   CHECK_ERROR("clReleaseContext")
 
-  free((void*)clSource[0]);
+  free((void *)clSource[0]);
 
   pb_FreeParameters(params);
 }
-
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.cc
index 97e9e9eb5518a56eff4cc7c9da7d5ce6d9b69e0b..9e7139ac6f43104a9b7b85c1f6d538257d827ab2 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.cc
@@ -6,83 +6,75 @@
  *cr
  ***************************************************************************/
 #include <CL/cl.h>
-#include <sys/time.h>
-#include <stdio.h>
-#include <math.h>
-#include <strings.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <strings.h>
+#include <sys/time.h>
 
 #include "model.h"
 
 unsigned int NUM_SETS;
 unsigned int NUM_ELEMENTS;
 
-int readdatafile(char *fname, struct cartesian *data, int npoints)
-{
+int readdatafile(char *fname, struct cartesian *data, int npoints) {
   FILE *infile;
   int lcount = 0;
   float ra, dec;
 
-  if ((infile = fopen(fname, "r")) == NULL)
-    {
-      fprintf(stderr, "Unable to open data file %s for reading\n", fname);
-      return lcount;
-    }
+  if ((infile = fopen(fname, "r")) == NULL) {
+    fprintf(stderr, "Unable to open data file %s for reading\n", fname);
+    return lcount;
+  }
+
+  for (lcount = 0; lcount < npoints; lcount++) {
+    if (fscanf(infile, "%f %f", &ra, &dec) != 2)
+      break;
 
-  for (lcount = 0; lcount < npoints; lcount++)
     {
-      if (fscanf(infile, "%f %f", &ra, &dec) != 2)
-	break;
+      // data conversion
+      float rarad = D2R * ra;
+      float decrad = D2R * dec;
+      float cd = cos(decrad);
 
-      {
-        // data conversion
-        float rarad = D2R * ra;
-        float decrad = D2R * dec;
-        float cd = cos(decrad);
-	
-	data[lcount].x = cos(rarad) * cd;
-	data[lcount].y = sin(rarad) * cd;
-	data[lcount].z = sin(decrad);
-      }
+      data[lcount].x = cos(rarad) * cd;
+      data[lcount].y = sin(rarad) * cd;
+      data[lcount].z = sin(decrad);
     }
+  }
 
   fclose(infile);
-  
+
   return lcount;
 }
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error: Cannot open kernel file for reading!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error: Cannot open kernel file for reading!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error: Cannot allocated buffer for file contents!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error: Cannot allocated buffer for file contents!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error: Cannot read kernel file contents!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error: Cannot read kernel file contents!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.h
index 1a8c149aac15b39ed9ecaaecc8318582babb33f6..f9df468e542d4104fb52e9e6782c7b8a1736648d 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/opencl_nvidia/model.h
@@ -8,9 +8,9 @@
 #ifndef __MODEL_H__
 #define __MODEL_H__
 
-#define D2R M_PI/180.0
-#define R2D 180.0/M_PI
-#define R2AM 60.0*180.0/M_PI
+#define D2R M_PI / 180.0
+#define R2D 180.0 / M_PI
+#define R2AM 60.0 * 180.0 / M_PI
 
 #define bins_per_dec 5
 #define min_arcmin 1.0
@@ -21,26 +21,23 @@
 
 typedef unsigned long hist_t;
 
-struct spherical 
-{
-  float ra, dec;  // latitude, longitude pair
+struct spherical {
+  float ra, dec; // latitude, longitude pair
 };
- 
-struct cartesian 
-{
-  float x, y, z;  // cartesian coodrinates
+
+struct cartesian {
+  float x, y, z; // cartesian coodrinates
 };
 
 int readdatafile(char *fname, struct cartesian *data, int npoints);
 
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc b/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc
index 6302a6a903468a830de03ac0ba62bc16da5aa37d..247ff6971c5a3d69a529aa3c1c9546dbd2346c54 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.cc
@@ -5,22 +5,21 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
+#include "args.h"
+#include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-#include <stdio.h>
-#include "args.h"
 
 extern char *optarg;
 
-void usage(char *name)
-{
+void usage(char *name) {
   printf("Usage: %s <-d data_file_name> <-r rnd_file_name> "
-	 "<-m rnd_count> <-p count> <-o file_name>\n", name);
+         "<-m rnd_count> <-p count> <-o file_name>\n",
+         name);
   exit(0);
 }
 
-void parse_args(int argc, char **argv, options* args)
-{
+void parse_args(int argc, char **argv, options *args) {
   int c;
 
   args->data_name = NULL;
@@ -28,28 +27,26 @@ void parse_args(int argc, char **argv, options* args)
   args->random_count = 0;
   args->npoints = 0;
   args->output_name = NULL;
-  
-  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF)
-    {
-      switch (c)
-	{
-        case 'd':
-          args->data_name = optarg;
-          break;
-        case 'r':
-          args->random_name = optarg;
-          break;
-        case 'n':
-          args->random_count = atoi(optarg);
-          break;
-        case 'o':
-          args->output_name = optarg;
-          break;
-        case 'p':
-          args->npoints = atol(optarg);
-          break;
-        default:
-          usage(argv[0]);
-	}
+
+  while ((c = getopt(argc, argv, "d:n:r:p:o:")) != EOF) {
+    switch (c) {
+    case 'd':
+      args->data_name = optarg;
+      break;
+    case 'r':
+      args->random_name = optarg;
+      break;
+    case 'n':
+      args->random_count = atoi(optarg);
+      break;
+    case 'o':
+      args->output_name = optarg;
+      break;
+    case 'p':
+      args->npoints = atol(optarg);
+      break;
+    default:
+      usage(argv[0]);
     }
+  }
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h b/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h
index 97f9d61ad70a8cf8a721186afccdb28c57d41c70..33f3d84a4825e8c0df758bd2d4dea7ddc1ed8949 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/args.h
@@ -8,8 +8,7 @@
 #ifndef __ARGS_H__
 #define __ARGS_H__
 
-typedef struct _options_
-{
+typedef struct _options_ {
   char *data_name;
   char *random_name;
   int random_count;
@@ -18,6 +17,6 @@ typedef struct _options_
 } options;
 
 void usage(char *name);
-void parse_args(int argc, char **argv, options* args);
+void parse_args(int argc, char **argv, options *args);
 
 #endif
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc b/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc
index d1482d732947aefc2f3eafb380f584680e692f7f..3239be6c92f641422f2ba6910894ae68cc8b220e 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/main.cc
@@ -5,11 +5,11 @@
  *cr                         All Rights Reserved
  *cr
  ***************************************************************************/
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
 
 #include "args.h"
 #include "model.h"
@@ -19,45 +19,35 @@
 extern unsigned int NUM_SETS;
 extern unsigned int NUM_ELEMENTS;
 
-
 #define WARP_SIZE 32
 #define NUM_BANKS 16
 #define LOG_NUM_BANKS 4
 
 #define BLOCK_SIZE 256
-#define NUM_WARPS (BLOCK_SIZE/WARP_SIZE)
+#define NUM_WARPS (BLOCK_SIZE / WARP_SIZE)
 #define HISTS_PER_WARP 16
-#define NUM_HISTOGRAMS  (NUM_WARPS*HISTS_PER_WARP)
-#define THREADS_PER_HIST (WARP_SIZE/HISTS_PER_WARP)
-
-#define warp_hists(x,y) warp_hists[(x)*NUM_HISTOGRAMS+(y)]
+#define NUM_HISTOGRAMS (NUM_WARPS * HISTS_PER_WARP)
+#define THREADS_PER_HIST (WARP_SIZE / HISTS_PER_WARP)
 
+#define warp_hists(x, y) warp_hists[(x)*NUM_HISTOGRAMS + (y)]
 
 typedef struct __attribute__((__packed__)) {
-  hist_t* histograms;
+  hist_t *histograms;
   size_t bytes_histograms;
-  float* all_x_data;
+  float *all_x_data;
   size_t bytes_all_data;
-  float* binb;
+  float *binb;
   size_t bytes_binb;
   int NUM_SETS;
   int NUM_ELEMENTS;
   long block;
   long grid;
-}
-RootIn;
-
-void packData(  RootIn* args,
-                hist_t* histograms,
-                size_t bytes_histograms,
-                float* all_x_data,
-                size_t bytes_all_data,
-                float* binb,
-                size_t bytes_binb,
-                int NUM_SETS,
-                int NUM_ELEMENTS,
-                long block,
-                long grid) {
+} RootIn;
+
+void packData(RootIn *args, hist_t *histograms, size_t bytes_histograms,
+              float *all_x_data, size_t bytes_all_data, float *binb,
+              size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS, long block,
+              long grid) {
   args->histograms = histograms;
   args->bytes_histograms = bytes_histograms;
   args->all_x_data = all_x_data;
@@ -70,205 +60,186 @@ void packData(  RootIn* args,
   args->grid = grid;
 }
 
-
 void Allocation(long block) {
   // Memory shared between threadblocks
-  //void* data_s = __visc__malloc(sizeof(struct cartesian)*BLOCK_SIZE);
-  void* warp_hists = __visc__malloc(sizeof(unsigned int)*NUM_BINS*NUM_HISTOGRAMS);
+  // void* data_s = __visc__malloc(sizeof(struct cartesian)*BLOCK_SIZE);
+  void *warp_hists =
+      __visc__malloc(sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS);
 
   //__visc__return(data_s, sizeof(struct cartesian)*BLOCK_SIZE,
-                 //warp_hists, sizeof(unsigned int)*NUM_BINS*NUM_HISTOGRAMS);
-  __visc__return(2, warp_hists, sizeof(unsigned int)*NUM_BINS*NUM_HISTOGRAMS);
+  // warp_hists, sizeof(unsigned int)*NUM_BINS*NUM_HISTOGRAMS);
+  __visc__return(2, warp_hists,
+                 sizeof(unsigned int) * NUM_BINS * NUM_HISTOGRAMS);
 }
 
-void TPACFLeaf(hist_t* histograms, size_t bytes_histograms,
+void TPACFLeaf(hist_t *histograms, size_t bytes_histograms,
                // next two args are read-only arrays
-               float* all_x_data, size_t bytes_all_data,
-               float* binb, size_t bytes_binb,
-               int NUM_SETS, int NUM_ELEMENTS,
+               float *all_x_data, size_t bytes_all_data, float *binb,
+               size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS,
                // shared memory args
-               //struct cartesian* data_s, size_t bytes_data_s,
-               unsigned int* warp_hists, size_t bytes_warp_hists) {
+               // struct cartesian* data_s, size_t bytes_data_s,
+               unsigned int *warp_hists, size_t bytes_warp_hists) {
 
   __visc__hint(visc::DEVICE);
   __visc__attributes(2, all_x_data, binb, 1, histograms);
 
-  void* thisNode = __visc__getNode();
-  void* parentNode = __visc__getParentNode(thisNode);
+  void *thisNode = __visc__getNode();
+  void *parentNode = __visc__getParentNode(thisNode);
   int lx = __visc__getNodeInstanceID_x(thisNode);
   int gx = __visc__getNodeInstanceID_x(parentNode);
   int dimx = __visc__getNumNodeInstances_x(thisNode);
 
-  float* all_y_data = all_x_data + NUM_ELEMENTS*(NUM_SETS+1); 
-  float* all_z_data = all_y_data + NUM_ELEMENTS*(NUM_SETS+1);
+  float *all_y_data = all_x_data + NUM_ELEMENTS * (NUM_SETS + 1);
+  float *all_z_data = all_y_data + NUM_ELEMENTS * (NUM_SETS + 1);
 
   unsigned int bx = gx;
   unsigned int tid = lx;
   bool do_self = (bx < (NUM_SETS + 1));
 
-  float* data_x;
-  float* data_y;
-  float* data_z;
-  float* random_x;
-  float* random_y;
-  float* random_z;
-
-  for(unsigned int w = 0; w < NUM_BINS*NUM_HISTOGRAMS; w += BLOCK_SIZE )
-    {
-      if((w+tid) < (NUM_BINS*NUM_HISTOGRAMS))
-	{
-	  warp_hists((w+tid)/NUM_HISTOGRAMS, (w+tid)%NUM_HISTOGRAMS) = 0;
-	}
+  float *data_x;
+  float *data_y;
+  float *data_z;
+  float *random_x;
+  float *random_y;
+  float *random_z;
+
+  for (unsigned int w = 0; w < NUM_BINS * NUM_HISTOGRAMS; w += BLOCK_SIZE) {
+    if ((w + tid) < (NUM_BINS * NUM_HISTOGRAMS)) {
+      warp_hists((w + tid) / NUM_HISTOGRAMS, (w + tid) % NUM_HISTOGRAMS) = 0;
     }
+  }
 
   // Get stuff into shared memory to kick off the loop.
-  if( !do_self)
-    {
-      data_x = all_x_data;
-      data_y = all_y_data;
-      data_z = all_z_data;
-
-      random_x = all_x_data + NUM_ELEMENTS * (bx - NUM_SETS);
-      random_y = all_y_data + NUM_ELEMENTS * (bx - NUM_SETS);
-      random_z = all_z_data + NUM_ELEMENTS * (bx - NUM_SETS);
-    }
-  else
-    {
-      random_x = all_x_data + NUM_ELEMENTS * (bx);
-      random_y = all_y_data + NUM_ELEMENTS * (bx);
-      random_z = all_z_data + NUM_ELEMENTS * (bx);
-      
-      data_x = random_x;
-      data_y = random_y;
-      data_z = random_z;
-    }
- 
+  if (!do_self) {
+    data_x = all_x_data;
+    data_y = all_y_data;
+    data_z = all_z_data;
+
+    random_x = all_x_data + NUM_ELEMENTS * (bx - NUM_SETS);
+    random_y = all_y_data + NUM_ELEMENTS * (bx - NUM_SETS);
+    random_z = all_z_data + NUM_ELEMENTS * (bx - NUM_SETS);
+  } else {
+    random_x = all_x_data + NUM_ELEMENTS * (bx);
+    random_y = all_y_data + NUM_ELEMENTS * (bx);
+    random_z = all_z_data + NUM_ELEMENTS * (bx);
+
+    data_x = random_x;
+    data_y = random_y;
+    data_z = random_z;
+  }
+
   // Iterate over all random points
-  for(unsigned int j = 0; j < NUM_ELEMENTS; j += BLOCK_SIZE)
-    {
-      // load current random point values
-      float random_x_s;
-      float random_y_s;
-      float random_z_s;
-	  
-      if(tid + j < NUM_ELEMENTS)
-        {
-	  random_x_s = random_x[tid + j];
-	  random_y_s = random_y[tid + j];
-	  random_z_s = random_z[tid + j];
-	}
-
-      // Iterate over all data points
-      // If do_self, then use a tighter bound on the number of data points.
-      for(unsigned int k = 0;
-	  k < NUM_ELEMENTS && (do_self ? k < j + BLOCK_SIZE : 1); k++)
-	{
-	  // do actual calculations on the values:
-	  float distance = data_x[k] * random_x_s + 
-	    data_y[k] * random_y_s + 
-	    data_z[k] * random_z_s ;
-
-	  unsigned int bin_index;
-
-	  // run binary search to find bin_index
-	  unsigned int min = 0;
-	  unsigned int max = NUM_BINS;
-	  {
-	    unsigned int k2;
-	      
-	    while (max > min+1)
-	      {
-		k2 = (min + max) / 2;
-		if (distance >= binb[k2]) 
-		  max = k2;
-		else 
-		  min = k2;
-	      }
-	    bin_index = max - 1;
-	  }
-
-	  unsigned int warpnum = tid / (WARP_SIZE/HISTS_PER_WARP);
-	  if((distance < binb[min]) && (distance >= binb[max]) && 
-	     (!do_self || (tid + j > k)) && ((tid + j) < NUM_ELEMENTS))
-	    {
-	      __visc__atomic_add((int*)&(warp_hists(bin_index, warpnum)), 1);
-	    }
-	}
+  for (unsigned int j = 0; j < NUM_ELEMENTS; j += BLOCK_SIZE) {
+    // load current random point values
+    float random_x_s;
+    float random_y_s;
+    float random_z_s;
+
+    if (tid + j < NUM_ELEMENTS) {
+      random_x_s = random_x[tid + j];
+      random_y_s = random_y[tid + j];
+      random_z_s = random_z[tid + j];
     }
-    
+
+    // Iterate over all data points
+    // If do_self, then use a tighter bound on the number of data points.
+    for (unsigned int k = 0;
+         k < NUM_ELEMENTS && (do_self ? k < j + BLOCK_SIZE : 1); k++) {
+      // do actual calculations on the values:
+      float distance = data_x[k] * random_x_s + data_y[k] * random_y_s +
+                       data_z[k] * random_z_s;
+
+      unsigned int bin_index;
+
+      // run binary search to find bin_index
+      unsigned int min = 0;
+      unsigned int max = NUM_BINS;
+      {
+        unsigned int k2;
+
+        while (max > min + 1) {
+          k2 = (min + max) / 2;
+          if (distance >= binb[k2])
+            max = k2;
+          else
+            min = k2;
+        }
+        bin_index = max - 1;
+      }
+
+      unsigned int warpnum = tid / (WARP_SIZE / HISTS_PER_WARP);
+      if ((distance < binb[min]) && (distance >= binb[max]) &&
+          (!do_self || (tid + j > k)) && ((tid + j) < NUM_ELEMENTS)) {
+        __visc__atomic_add((int *)&(warp_hists(bin_index, warpnum)), 1);
+      }
+    }
+  }
+
   // coalesce the histograms in a block
-  unsigned int warp_index = tid & ( (NUM_HISTOGRAMS>>1) - 1);
-  unsigned int bin_index = tid / (NUM_HISTOGRAMS>>1);
-  for(unsigned int offset = NUM_HISTOGRAMS >> 1; offset > 0; 
-      offset >>= 1)
-    {
-      for(unsigned int bin_base = 0; bin_base < NUM_BINS; 
-	  bin_base += BLOCK_SIZE/ (NUM_HISTOGRAMS>>1))
-	{
-	  __visc__barrier();
-	  if(warp_index < offset && bin_base+bin_index < NUM_BINS )
-	    {
-	      unsigned long sum =
-		warp_hists(bin_base + bin_index, warp_index) + 
-		warp_hists(bin_base + bin_index, warp_index+offset);
-	      warp_hists(bin_base + bin_index, warp_index) = sum;
-	    }
-	}
+  unsigned int warp_index = tid & ((NUM_HISTOGRAMS >> 1) - 1);
+  unsigned int bin_index = tid / (NUM_HISTOGRAMS >> 1);
+  for (unsigned int offset = NUM_HISTOGRAMS >> 1; offset > 0; offset >>= 1) {
+    for (unsigned int bin_base = 0; bin_base < NUM_BINS;
+         bin_base += BLOCK_SIZE / (NUM_HISTOGRAMS >> 1)) {
+      __visc__barrier();
+      if (warp_index < offset && bin_base + bin_index < NUM_BINS) {
+        unsigned long sum =
+            warp_hists(bin_base + bin_index, warp_index) +
+            warp_hists(bin_base + bin_index, warp_index + offset);
+        warp_hists(bin_base + bin_index, warp_index) = sum;
+      }
     }
-    
+  }
+
   __visc__barrier();
-    
+
   // Put the results back in the real histogram
   // warp_hists(x, 0) holds sum of all locations of bin x
-  hist_t* hist_base = histograms + NUM_BINS * bx;
-  if(tid < NUM_BINS)
-    {
-      hist_base[tid] = warp_hists(tid, 0);
-    }
+  hist_t *hist_base = histograms + NUM_BINS * bx;
+  if (tid < NUM_BINS) {
+    hist_base[tid] = warp_hists(tid, 0);
+  }
 }
 
-void BlockingTPACF(hist_t* histograms, size_t bytes_histograms,
-                   float* all_x_data, size_t bytes_all_data,
+void BlockingTPACF(hist_t *histograms, size_t bytes_histograms,
+                   float *all_x_data, size_t bytes_all_data,
                    // next arg is read-only constant
-                   float* binb, size_t bytes_binb,
-                   int NUM_SETS, int NUM_ELEMENTS,
-                   long block) {
+                   float *binb, size_t bytes_binb, int NUM_SETS,
+                   int NUM_ELEMENTS, long block) {
 
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, all_x_data, binb, 1, histograms);
 
-  void* AllocationNode = __visc__createNodeND(0, Allocation);
-  void* TPACFLeafNode = __visc__createNodeND(1, TPACFLeaf, block);
+  void *AllocationNode = __visc__createNodeND(0, Allocation);
+  void *TPACFLeafNode = __visc__createNodeND(1, TPACFLeaf, block);
 
   // Bind Inputs
   __visc__bindIn(AllocationNode, 8, 0, 0); // Bind block
-  __visc__bindIn(TPACFLeafNode, 0, 0, 0); // Bind histograms
-  __visc__bindIn(TPACFLeafNode, 1, 1, 0); // Bind bytes_histograms
-  __visc__bindIn(TPACFLeafNode, 2, 2, 0); // Bind all_x_data
-  __visc__bindIn(TPACFLeafNode, 3, 3, 0); // Bind bytes_all_data
-  __visc__bindIn(TPACFLeafNode, 4, 4, 0); // Bind binb
-  __visc__bindIn(TPACFLeafNode, 5, 5, 0); // Bind bytes_binb
-  __visc__bindIn(TPACFLeafNode, 6, 6, 0); // Bind NUM_SETS
-  __visc__bindIn(TPACFLeafNode, 7, 7, 0); // Bind NUM_ELEMENTS
+  __visc__bindIn(TPACFLeafNode, 0, 0, 0);  // Bind histograms
+  __visc__bindIn(TPACFLeafNode, 1, 1, 0);  // Bind bytes_histograms
+  __visc__bindIn(TPACFLeafNode, 2, 2, 0);  // Bind all_x_data
+  __visc__bindIn(TPACFLeafNode, 3, 3, 0);  // Bind bytes_all_data
+  __visc__bindIn(TPACFLeafNode, 4, 4, 0);  // Bind binb
+  __visc__bindIn(TPACFLeafNode, 5, 5, 0);  // Bind bytes_binb
+  __visc__bindIn(TPACFLeafNode, 6, 6, 0);  // Bind NUM_SETS
+  __visc__bindIn(TPACFLeafNode, 7, 7, 0);  // Bind NUM_ELEMENTS
 
   // Create Edges
   __visc__edge(AllocationNode, TPACFLeafNode, 1, 0, 8, 0); // Edge warp_hists
-  __visc__edge(AllocationNode, TPACFLeafNode, 1, 1, 9, 0); // Edge bytes_warp_hists
-
+  __visc__edge(AllocationNode, TPACFLeafNode, 1, 1, 9,
+               0); // Edge bytes_warp_hists
 }
 
-void TPACFRoot(hist_t* histograms, size_t bytes_histograms,
-               float* all_x_data, size_t bytes_all_data,
+void TPACFRoot(hist_t *histograms, size_t bytes_histograms, float *all_x_data,
+               size_t bytes_all_data,
                // next arg is read-only constant
-               float* binb, size_t bytes_binb,
-               int NUM_SETS, int NUM_ELEMENTS,
-               long block,
-               long grid) {
+               float *binb, size_t bytes_binb, int NUM_SETS, int NUM_ELEMENTS,
+               long block, long grid) {
 
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, all_x_data, binb, 1, histograms);
 
-  void* BlockingTPACFNode = __visc__createNodeND(1, BlockingTPACF, grid);
+  void *BlockingTPACFNode = __visc__createNodeND(1, BlockingTPACF, grid);
 
   // Bind Inputs
   __visc__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms
@@ -280,21 +251,17 @@ void TPACFRoot(hist_t* histograms, size_t bytes_histograms,
   __visc__bindIn(BlockingTPACFNode, 6, 6, 0); // Bind NUM_SETS
   __visc__bindIn(BlockingTPACFNode, 7, 7, 0); // Bind NUM_ELEMENTS
   __visc__bindIn(BlockingTPACFNode, 8, 8, 0); // Bind block
-
 }
 
-void TPACFWrapper(
-  hist_t* histograms, size_t bytes_histograms,
-  float* all_x_data, size_t bytes_all_data,
-  // next arg is read-only constant
-  float* binb, size_t bytes_binb,
-  int NUM_SETS, int NUM_ELEMENTS,
-  long block, long grid
-) {
+void TPACFWrapper(hist_t *histograms, size_t bytes_histograms,
+                  float *all_x_data, size_t bytes_all_data,
+                  // next arg is read-only constant
+                  float *binb, size_t bytes_binb, int NUM_SETS,
+                  int NUM_ELEMENTS, long block, long grid) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, all_x_data, binb, 1, histograms);
 
-  void* BlockingTPACFNode = __visc__createNodeND(0, TPACFRoot);
+  void *BlockingTPACFNode = __visc__createNodeND(0, TPACFRoot);
 
   // Bind Inputs
   __visc__bindIn(BlockingTPACFNode, 0, 0, 0); // Bind histograms
@@ -311,18 +278,15 @@ void TPACFWrapper(
 
 // **===-----------------------------------------------------------===**
 
-int
-main( int argc, char** argv)
-{
+int main(int argc, char **argv) {
   struct pb_TimerSet timers;
-  struct pb_Parameters* params;
+  struct pb_Parameters *params;
 
-  params = pb_ReadParameters( &argc, argv );
+  params = pb_ReadParameters(&argc, argv);
 
   options args;
   parse_args(argc, argv, &args);
 
-
   NUM_ELEMENTS = args.npoints;
   NUM_SETS = args.random_count;
   int num_elements = NUM_ELEMENTS;
@@ -332,54 +296,50 @@ main( int argc, char** argv)
   printf("Bins per dec: %i\n", bins_per_dec);
   printf("Total bins  : %i\n", NUM_BINS);
 
-  //read in files
-  unsigned mem_size = (1+NUM_SETS)*num_elements*sizeof(struct cartesian);
-  unsigned f_mem_size = (1+NUM_SETS)*num_elements*sizeof(float);
+  // read in files
+  unsigned mem_size = (1 + NUM_SETS) * num_elements * sizeof(struct cartesian);
+  unsigned f_mem_size = (1 + NUM_SETS) * num_elements * sizeof(float);
 
   // container for all the points read from files
   struct cartesian *h_all_data;
-  h_all_data = (struct cartesian*) malloc(mem_size);
+  h_all_data = (struct cartesian *)malloc(mem_size);
   // Until I can get libs fixed
 
   // iterator for data files
   struct cartesian *working = h_all_data;
 
-
   // go through and read all data and random points into h_all_data
-  //pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  // pb_SwitchToTimer( &timers, pb_TimerID_IO );
   readdatafile(params->inpFiles[0], working, num_elements);
 
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
   working += num_elements;
-  for(int i = 0; i < (NUM_SETS); i++)
-  {
-    //pb_SwitchToTimer( &timers, pb_TimerID_IO );
-    readdatafile(params->inpFiles[i+1], working, num_elements);
-    //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  for (int i = 0; i < (NUM_SETS); i++) {
+    // pb_SwitchToTimer( &timers, pb_TimerID_IO );
+    readdatafile(params->inpFiles[i + 1], working, num_elements);
+    // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
 
     working += num_elements;
   }
 
-  pb_InitializeTimerSet( &timers );
+  pb_InitializeTimerSet(&timers);
   __visc__init();
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // split into x, y, and z arrays
   // AOS to SOA transformation
-  size_t bytes_h_x_data = 3*f_mem_size;
-  float * h_x_data = (float*) malloc (bytes_h_x_data);
+  size_t bytes_h_x_data = 3 * f_mem_size;
+  float *h_x_data = (float *)malloc(bytes_h_x_data);
   llvm_visc_track_mem(h_x_data, bytes_h_x_data);
 
-  float * h_y_data = h_x_data + NUM_ELEMENTS*(NUM_SETS+1);
-  float * h_z_data = h_y_data + NUM_ELEMENTS*(NUM_SETS+1);
-  for(int i = 0; i < (NUM_SETS+1); ++i)
-  {
-    for(int j = 0; j < NUM_ELEMENTS; ++j)
-    {
-      h_x_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].x;
-      h_y_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].y;
-      h_z_data[i*NUM_ELEMENTS+j] = h_all_data[i*NUM_ELEMENTS+j].z;
+  float *h_y_data = h_x_data + NUM_ELEMENTS * (NUM_SETS + 1);
+  float *h_z_data = h_y_data + NUM_ELEMENTS * (NUM_SETS + 1);
+  for (int i = 0; i < (NUM_SETS + 1); ++i) {
+    for (int j = 0; j < NUM_ELEMENTS; ++j) {
+      h_x_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].x;
+      h_y_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].y;
+      h_z_data[i * NUM_ELEMENTS + j] = h_all_data[i * NUM_ELEMENTS + j].z;
     }
   }
 
@@ -387,77 +347,61 @@ main( int argc, char** argv)
   free(h_all_data);
 
   // allocate system memory for final histograms
-  size_t bytes_hists = NUM_BINS*(NUM_SETS*2+1)*sizeof(hist_t);
-  hist_t *hists = (hist_t *) malloc(bytes_hists);
+  size_t bytes_hists = NUM_BINS * (NUM_SETS * 2 + 1) * sizeof(hist_t);
+  hist_t *hists = (hist_t *)malloc(bytes_hists);
   llvm_visc_track_mem(hists, bytes_hists);
 
   // Initialize the boundary constants for bin search
-  size_t bytes_binb = (NUM_BINS+1)*sizeof(float);
-  float *binb = (float*)malloc(bytes_binb);
+  size_t bytes_binb = (NUM_BINS + 1) * sizeof(float);
+  float *binb = (float *)malloc(bytes_binb);
   llvm_visc_track_mem(binb, bytes_binb);
 
-  for (int k = 0; k < NUM_BINS+1; k++)
-  {
-    binb[k] = cos(pow(10.0, (log10(min_arcmin) + k*1.0/bins_per_dec))
-                  / 60.0*D2R);
+  for (int k = 0; k < NUM_BINS + 1; k++) {
+    binb[k] = cos(pow(10.0, (log10(min_arcmin) + k * 1.0 / bins_per_dec)) /
+                  60.0 * D2R);
   }
 
   // **===------------------ Kick off TPACF on OpenCL------------------===**
 
   long block = BLOCK_SIZE;
-  long grid = (NUM_SETS*2 + 1);
-
-  RootIn* graph_args = (RootIn*) malloc (sizeof(RootIn));
-  packData(graph_args,
-           hists,
-           bytes_hists,
-           h_x_data,
-           bytes_h_x_data,
-           binb,
-           bytes_binb,
-           NUM_SETS,
-           NUM_ELEMENTS,
-           block,
-           grid);
-  pb_SwitchToTimer( &timers, visc_TimerID_COMPUTATION );
-  
-  void* TPACF_DFG = __visc__launch(0, TPACFRoot, (void*)graph_args);
+  long grid = (NUM_SETS * 2 + 1);
+
+  RootIn *graph_args = (RootIn *)malloc(sizeof(RootIn));
+  packData(graph_args, hists, bytes_hists, h_x_data, bytes_h_x_data, binb,
+           bytes_binb, NUM_SETS, NUM_ELEMENTS, block, grid);
+  pb_SwitchToTimer(&timers, visc_TimerID_COMPUTATION);
+
+  void *TPACF_DFG = __visc__launch(0, TPACFRoot, (void *)graph_args);
   __visc__wait(TPACF_DFG);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COPY );
+  pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 
-  pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
   // **===-------------------------------------------------------------===**
 
   llvm_visc_request_mem(hists, bytes_hists);
   // references into output histograms
   hist_t *dd_hist = hists;
   hist_t *rr_hist = dd_hist + NUM_BINS;
-  hist_t *dr_hist = rr_hist + NUM_BINS*NUM_SETS;
+  hist_t *dr_hist = rr_hist + NUM_BINS * NUM_SETS;
 
   // add up values within dr and rr
   int rr[NUM_BINS];
-  for(int i=0; i<NUM_BINS; i++)
-  {
+  for (int i = 0; i < NUM_BINS; i++) {
     rr[i] = 0;
   }
-  for(int i=0; i<NUM_SETS; i++)
-  {
-    for(int j=0; j<NUM_BINS; j++)
-    {
-      rr[j] += rr_hist[i*NUM_BINS + j];
+  for (int i = 0; i < NUM_SETS; i++) {
+    for (int j = 0; j < NUM_BINS; j++) {
+      rr[j] += rr_hist[i * NUM_BINS + j];
     }
   }
   int dr[NUM_BINS];
-  for(int i=0; i<NUM_BINS; i++)
-  {
+  for (int i = 0; i < NUM_BINS; i++) {
     dr[i] = 0;
   }
-  for(int i=0; i<NUM_SETS; i++)
-  {
-    for(int j=0; j<NUM_BINS; j++)
-    {
-      dr[j] += dr_hist[i*NUM_BINS + j];
+  for (int i = 0; i < NUM_SETS; i++) {
+    for (int j = 0; j < NUM_BINS; j++) {
+      dr[j] += dr_hist[i * NUM_BINS + j];
     }
   }
 
@@ -466,27 +410,25 @@ main( int argc, char** argv)
   __visc__cleanup();
 
   FILE *outfile;
-  if ((outfile = fopen(params->outFile, "w")) == NULL)
-  {
-    fprintf(stderr, "Unable to open output file %s for writing, "
-            "assuming stdout\n", params->outFile);
+  if ((outfile = fopen(params->outFile, "w")) == NULL) {
+    fprintf(stderr,
+            "Unable to open output file %s for writing, "
+            "assuming stdout\n",
+            params->outFile);
     outfile = stdout;
   }
 
-  //pb_SwitchToTimer( &timers, pb_TimerID_IO );
+  // pb_SwitchToTimer( &timers, pb_TimerID_IO );
   // print out final histograms + omega (while calculating omega)
-  for(int i=0; i<NUM_BINS; i++)
-  {
+  for (int i = 0; i < NUM_BINS; i++) {
     fprintf(outfile, "%d\n%d\n%d\n", dd_hist[i], dr[i], rr[i]);
   }
 
-  //pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
-  if(outfile != stdout)
+  // pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE );
+  if (outfile != stdout)
     fclose(outfile);
 
   // cleanup memory
   free(hists);
   free(h_x_data);
-
 }
-
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc b/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc
index 97e9e9eb5518a56eff4cc7c9da7d5ce6d9b69e0b..9e7139ac6f43104a9b7b85c1f6d538257d827ab2 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.cc
@@ -6,83 +6,75 @@
  *cr
  ***************************************************************************/
 #include <CL/cl.h>
-#include <sys/time.h>
-#include <stdio.h>
-#include <math.h>
-#include <strings.h>
 #include <math.h>
 #include <parboil.h>
+#include <stdio.h>
+#include <strings.h>
+#include <sys/time.h>
 
 #include "model.h"
 
 unsigned int NUM_SETS;
 unsigned int NUM_ELEMENTS;
 
-int readdatafile(char *fname, struct cartesian *data, int npoints)
-{
+int readdatafile(char *fname, struct cartesian *data, int npoints) {
   FILE *infile;
   int lcount = 0;
   float ra, dec;
 
-  if ((infile = fopen(fname, "r")) == NULL)
-    {
-      fprintf(stderr, "Unable to open data file %s for reading\n", fname);
-      return lcount;
-    }
+  if ((infile = fopen(fname, "r")) == NULL) {
+    fprintf(stderr, "Unable to open data file %s for reading\n", fname);
+    return lcount;
+  }
+
+  for (lcount = 0; lcount < npoints; lcount++) {
+    if (fscanf(infile, "%f %f", &ra, &dec) != 2)
+      break;
 
-  for (lcount = 0; lcount < npoints; lcount++)
     {
-      if (fscanf(infile, "%f %f", &ra, &dec) != 2)
-	break;
+      // data conversion
+      float rarad = D2R * ra;
+      float decrad = D2R * dec;
+      float cd = cos(decrad);
 
-      {
-        // data conversion
-        float rarad = D2R * ra;
-        float decrad = D2R * dec;
-        float cd = cos(decrad);
-	
-	data[lcount].x = cos(rarad) * cd;
-	data[lcount].y = sin(rarad) * cd;
-	data[lcount].z = sin(decrad);
-      }
+      data[lcount].x = cos(rarad) * cd;
+      data[lcount].y = sin(rarad) * cd;
+      data[lcount].z = sin(decrad);
     }
+  }
 
   fclose(infile);
-  
+
   return lcount;
 }
 
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error: Cannot open kernel file for reading!\n");
-                exit(1);
-        }
+char *readFile(const char *fileName) {
+  FILE *fp;
+  fp = fopen(fileName, "r");
+  if (fp == NULL) {
+    printf("Error: Cannot open kernel file for reading!\n");
+    exit(1);
+  }
 
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
+  fseek(fp, 0, SEEK_END);
+  long size = ftell(fp);
+  rewind(fp);
 
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error: Cannot allocated buffer for file contents!\n");
-                fclose(fp);
-                exit(1);
-        }
+  char *buffer = (char *)malloc(sizeof(char) * (size + 1));
+  if (buffer == NULL) {
+    printf("Error: Cannot allocated buffer for file contents!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error: Cannot read kernel file contents!\n");
-                fclose(fp);
-                exit(1);
-        }
+  size_t res = fread(buffer, 1, size, fp);
+  if (res != size) {
+    printf("Error: Cannot read kernel file contents!\n");
+    fclose(fp);
+    exit(1);
+  }
 
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
+  buffer[size] = 0;
+  fclose(fp);
+  return buffer;
 }
diff --git a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h b/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h
index 1a8c149aac15b39ed9ecaaecc8318582babb33f6..f9df468e542d4104fb52e9e6782c7b8a1736648d 100644
--- a/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h
+++ b/hpvm/test/parboil/benchmarks/tpacf/src/visc/model.h
@@ -8,9 +8,9 @@
 #ifndef __MODEL_H__
 #define __MODEL_H__
 
-#define D2R M_PI/180.0
-#define R2D 180.0/M_PI
-#define R2AM 60.0*180.0/M_PI
+#define D2R M_PI / 180.0
+#define R2D 180.0 / M_PI
+#define R2AM 60.0 * 180.0 / M_PI
 
 #define bins_per_dec 5
 #define min_arcmin 1.0
@@ -21,26 +21,23 @@
 
 typedef unsigned long hist_t;
 
-struct spherical 
-{
-  float ra, dec;  // latitude, longitude pair
+struct spherical {
+  float ra, dec; // latitude, longitude pair
 };
- 
-struct cartesian 
-{
-  float x, y, z;  // cartesian coodrinates
+
+struct cartesian {
+  float x, y, z; // cartesian coodrinates
 };
 
 int readdatafile(char *fname, struct cartesian *data, int npoints);
 
-char* readFile(const char*);
+char *readFile(const char *);
 
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
+#define CHECK_ERROR(errorMessage)                                              \
+  if (clStatus != CL_SUCCESS) {                                                \
+    printf("Error: %s!\n", errorMessage);                                      \
+    printf("Line: %d\n", __LINE__);                                            \
+    exit(1);                                                                   \
   }
 
 #endif
diff --git a/hpvm/test/parboil/common/include/parboil.h b/hpvm/test/parboil/common/include/parboil.h
index 41f78a99c07cea79e013eee86bc746f223517fdc..30ad6721c3190610dd08ec131603b6fe622f897e 100644
--- a/hpvm/test/parboil/common/include/parboil.h
+++ b/hpvm/test/parboil/common/include/parboil.h
@@ -12,13 +12,13 @@ extern "C" {
 
 /* Command line parameters for benchmarks */
 struct pb_Parameters {
-  char *outFile;		/* If not NULL, the raw output of the
-				 * computation should be saved to this
-				 * file. The string is owned. */
-  char **inpFiles;		/* A NULL-terminated array of strings
-				 * holding the input file(s) for the
-				 * computation.  The array and strings
-				 * are owned. */
+  char *outFile;   /* If not NULL, the raw output of the
+                    * computation should be saved to this
+                    * file. The string is owned. */
+  char **inpFiles; /* A NULL-terminated array of strings
+                    * holding the input file(s) for the
+                    * computation.  The array and strings
+                    * are owned. */
 };
 
 /* Read command-line parameters.
@@ -30,24 +30,21 @@ struct pb_Parameters {
  * If there is an error, then an error message is printed on stderr
  * and NULL is returned.
  */
-struct pb_Parameters *
-pb_ReadParameters(int *_argc, char **argv);
+struct pb_Parameters *pb_ReadParameters(int *_argc, char **argv);
 
 /* Free an instance of struct pb_Parameters.
  */
-void
-pb_FreeParameters(struct pb_Parameters *p);
+void pb_FreeParameters(struct pb_Parameters *p);
 
 /* Count the number of input files in a pb_Parameters instance.
  */
-int
-pb_Parameters_CountInputs(struct pb_Parameters *p);
+int pb_Parameters_CountInputs(struct pb_Parameters *p);
 
 /* A time or duration. */
 #if _POSIX_VERSION >= 200112L
 typedef unsigned long long pb_Timestamp; /* time in microseconds */
 #else
-# error "Timestamps not implemented"
+#error "Timestamps not implemented"
 #endif
 
 enum pb_TimerState {
@@ -57,57 +54,53 @@ enum pb_TimerState {
 
 struct pb_Timer {
   enum pb_TimerState state;
-  pb_Timestamp elapsed;		/* Amount of time elapsed so far */
-  pb_Timestamp init;		/* Beginning of the current time interval,
-				 * if state is RUNNING.  End of the last 
-				 * recorded time interfal otherwise.  */
+  pb_Timestamp elapsed; /* Amount of time elapsed so far */
+  pb_Timestamp init;    /* Beginning of the current time interval,
+                         * if state is RUNNING.  End of the last
+                         * recorded time interfal otherwise.  */
 };
 
 /* Reset a timer.
  * Use this to initialize a timer or to clear
  * its elapsed time.  The reset timer is stopped.
  */
-void
-pb_ResetTimer(struct pb_Timer *timer);
+void pb_ResetTimer(struct pb_Timer *timer);
 
 /* Start a timer.  The timer is set to RUNNING mode and
  * time elapsed while the timer is running is added to
  * the timer.
  * The timer should not already be running.
  */
-void
-pb_StartTimer(struct pb_Timer *timer);
+void pb_StartTimer(struct pb_Timer *timer);
 
 /* Stop a timer.
  * This stops adding elapsed time to the timer.
  * The timer should not already be stopped.
  */
-void
-pb_StopTimer(struct pb_Timer *timer);
+void pb_StopTimer(struct pb_Timer *timer);
 
 /* Get the elapsed time in seconds. */
-double
-pb_GetElapsedTime(struct pb_Timer *timer);
+double pb_GetElapsedTime(struct pb_Timer *timer);
 
 /* Execution time is assigned to one of these categories. */
 enum pb_TimerID {
   pb_TimerID_NONE = 0,
-  pb_TimerID_IO,		/* Time spent in input/output */
-  pb_TimerID_KERNEL,		/* Time spent computing on the device, 
-				 * recorded asynchronously */
-  pb_TimerID_COPY,		/* Time spent synchronously moving data 
-				 * to/from device and allocating/freeing 
-				 * memory on the device */
-  pb_TimerID_DRIVER,		/* Time spent in the host interacting with the 
-				 * driver, primarily for recording the time 
-                                 * spent queueing asynchronous operations */
-  pb_TimerID_COPY_ASYNC,	/* Time spent in asynchronous transfers */
-  pb_TimerID_COMPUTE,		/* Time for all program execution other
-				 * than parsing command line arguments,
-				 * I/O, kernel, and copy */
-  pb_TimerID_OVERLAP,		/* Time double-counted in asynchronous and 
-				 * host activity: automatically filled in, 
-				 * not intended for direct usage */
+  pb_TimerID_IO,         /* Time spent in input/output */
+  pb_TimerID_KERNEL,     /* Time spent computing on the device,
+                          * recorded asynchronously */
+  pb_TimerID_COPY,       /* Time spent synchronously moving data
+                          * to/from device and allocating/freeing
+                          * memory on the device */
+  pb_TimerID_DRIVER,     /* Time spent in the host interacting with the
+                          * driver, primarily for recording the time
+                          * spent queueing asynchronous operations */
+  pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
+  pb_TimerID_COMPUTE,    /* Time for all program execution other
+                          * than parsing command line arguments,
+                          * I/O, kernel, and copy */
+  pb_TimerID_OVERLAP,    /* Time double-counted in asynchronous and
+                          * host activity: automatically filled in,
+                          * not intended for direct usage */
   // GPU FUNCTION
   visc_TimerID_INIT_CTX,
   visc_TimerID_CLEAR_CTX,
@@ -127,17 +120,17 @@ enum pb_TimerID {
   visc_TimerID_OUTPUT_PACK,
   visc_TimerID_OUTPUT_UNPACK,
 
-  pb_TimerID_LAST		/* Number of timer IDs */
+  pb_TimerID_LAST /* Number of timer IDs */
 };
 
 /* Dynamic list of asynchronously tracked times between events */
 struct pb_async_time_marker_list {
-  char *label; // actually just a pointer to a string
-  enum pb_TimerID timerID;	/* The ID to which the interval beginning 
-                                 * with this marker should be attributed */
-  void * marker; 
-  //cudaEvent_t marker; 		/* The driver event for this marker */
-  struct pb_async_time_marker_list *next; 
+  char *label;             // actually just a pointer to a string
+  enum pb_TimerID timerID; /* The ID to which the interval beginning
+                            * with this marker should be attributed */
+  void *marker;
+  // cudaEvent_t marker; 		/* The driver event for this marker */
+  struct pb_async_time_marker_list *next;
 };
 
 struct pb_SubTimer {
@@ -154,7 +147,7 @@ struct pb_SubTimerList {
 /* A set of timers for recording execution times. */
 struct pb_TimerSet {
   enum pb_TimerID current;
-  struct pb_async_time_marker_list* async_markers;
+  struct pb_async_time_marker_list *async_markers;
   pb_Timestamp async_begin;
   pb_Timestamp wall_begin;
   struct pb_Timer timers[pb_TimerID_LAST];
@@ -162,37 +155,33 @@ struct pb_TimerSet {
 };
 
 /* Reset all timers in the set. */
-void
-pb_InitializeTimerSet(struct pb_TimerSet *timers);
+void pb_InitializeTimerSet(struct pb_TimerSet *timers);
 
-void
-pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
+void pb_AddSubTimer(struct pb_TimerSet *timers, char *label,
+                    enum pb_TimerID pb_Category);
 
 /* Select which timer the next interval of time should be accounted
  * to. The selected timer is started and other timers are stopped.
  * Using pb_TimerID_NONE stops all timers. */
-void
-pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
+void pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
 
-void
-pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
+void pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label,
+                         enum pb_TimerID category);
 
 /* Print timer values to standard output. */
-void
-pb_PrintTimerSet(struct pb_TimerSet *timers);
+void pb_PrintTimerSet(struct pb_TimerSet *timers);
 
 /* Release timer resources */
-void
-pb_DestroyTimerSet(struct pb_TimerSet * timers);
+void pb_DestroyTimerSet(struct pb_TimerSet *timers);
 
-void
-pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
+void pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
 
-void
-pb_CreateAndBuildKernelFromBinary(const char* file, const char* kernel, void* clContextPtr, void* clDevicePtr, void* clProgramPtr, void* clKerenlPtr);
+void pb_CreateAndBuildKernelFromBinary(const char *file, const char *kernel,
+                                       void *clContextPtr, void *clDevicePtr,
+                                       void *clProgramPtr, void *clKerenlPtr);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif //PARBOIL_HEADER
+#endif // PARBOIL_HEADER
diff --git a/hpvm/test/parboil/common/include/visc.h b/hpvm/test/parboil/common/include/visc.h
index b0a0f141e575b104f0f3934416956cf9cd1f1904..6edc07a0a39353566d2b8edb72a2a39a83dba288 100644
--- a/hpvm/test/parboil/common/include/visc.h
+++ b/hpvm/test/parboil/common/include/visc.h
@@ -20,54 +20,55 @@ void __visc__hint(enum Target) noexcept;
 #endif
 
 #ifdef __cplusplus
-void* __visc__node(...) noexcept;
-//void* __visc__createNode(...) noexcept;
-//void* __visc__createNode1D(...) noexcept;
-//void* __visc__createNode2D(...) noexcept;
-//void* __visc__createNode3D(...) noexcept;
-//void __visc__return(...) noexcept;
+void *__visc__node(...) noexcept;
+// void* __visc__createNode(...) noexcept;
+// void* __visc__createNode1D(...) noexcept;
+// void* __visc__createNode2D(...) noexcept;
+// void* __visc__createNode3D(...) noexcept;
+// void __visc__return(...) noexcept;
 #endif
-void* __visc__createNodeND(unsigned, ...) noexcept;
+void *__visc__createNodeND(unsigned, ...) noexcept;
 void __visc__return(unsigned, ...) noexcept;
 
 void __visc__attributes(unsigned, ...) noexcept;
 void __visc__init() noexcept;
 void __visc__cleanup() noexcept;
 
-void __visc__bindIn(void*, unsigned, unsigned, unsigned) noexcept;
-void __visc__bindOut(void*, unsigned, unsigned, unsigned) noexcept;
-void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned) noexcept;
-void __visc__push(void*, void*) noexcept;
-void* __visc__pop(void*) noexcept;
-void* __visc__launch(unsigned, ...) noexcept;
-void __visc__wait(void*) noexcept;
-
-void* __visc__getNode() noexcept;
-void* __visc__getParentNode(void*) noexcept;
+void __visc__bindIn(void *, unsigned, unsigned, unsigned) noexcept;
+void __visc__bindOut(void *, unsigned, unsigned, unsigned) noexcept;
+void *__visc__edge(void *, void *, unsigned, unsigned, unsigned,
+                   unsigned) noexcept;
+void __visc__push(void *, void *) noexcept;
+void *__visc__pop(void *) noexcept;
+void *__visc__launch(unsigned, ...) noexcept;
+void __visc__wait(void *) noexcept;
+
+void *__visc__getNode() noexcept;
+void *__visc__getParentNode(void *) noexcept;
 void __visc__barrier() noexcept;
-void* __visc__malloc(long) noexcept;
-long __visc__getNodeInstanceID_x(void*) noexcept;
-long __visc__getNodeInstanceID_y(void*) noexcept;
-long __visc__getNodeInstanceID_z(void*) noexcept;
-long __visc__getNumNodeInstances_x(void*) noexcept;
-long __visc__getNumNodeInstances_y(void*) noexcept;
-long __visc__getNumNodeInstances_z(void*) noexcept;
+void *__visc__malloc(long) noexcept;
+long __visc__getNodeInstanceID_x(void *) noexcept;
+long __visc__getNodeInstanceID_y(void *) noexcept;
+long __visc__getNodeInstanceID_z(void *) noexcept;
+long __visc__getNumNodeInstances_x(void *) noexcept;
+long __visc__getNumNodeInstances_y(void *) noexcept;
+long __visc__getNumNodeInstances_z(void *) noexcept;
 
 // Atomic
 // signed int
-int __visc__atomic_cmpxchg(int*, int, int) noexcept;
-int __visc__atomic_add(int*, int) noexcept;
-int __visc__atomic_sub(int*, int) noexcept;
-int __visc__atomic_xchg(int*, int) noexcept;
-int __visc__atomic_inc(int*) noexcept;
-int __visc__atomic_dec(int*) noexcept;
-int __visc__atomic_min(int*, int) noexcept;
-int __visc__atomic_max(int*, int) noexcept;
-int __visc__atomic_umax(int*, int) noexcept;
-int __visc__atomic_umin(int*, int) noexcept;
-int __visc__atomic_and(int*, int) noexcept;
-int __visc__atomic_or(int*, int) noexcept;
-int __visc__atomic_xor(int*, int) noexcept;
+int __visc__atomic_cmpxchg(int *, int, int) noexcept;
+int __visc__atomic_add(int *, int) noexcept;
+int __visc__atomic_sub(int *, int) noexcept;
+int __visc__atomic_xchg(int *, int) noexcept;
+int __visc__atomic_inc(int *) noexcept;
+int __visc__atomic_dec(int *) noexcept;
+int __visc__atomic_min(int *, int) noexcept;
+int __visc__atomic_max(int *, int) noexcept;
+int __visc__atomic_umax(int *, int) noexcept;
+int __visc__atomic_umin(int *, int) noexcept;
+int __visc__atomic_and(int *, int) noexcept;
+int __visc__atomic_or(int *, int) noexcept;
+int __visc__atomic_xor(int *, int) noexcept;
 
 // Special Func
 float __visc__floor(float) noexcept;
@@ -76,18 +77,17 @@ float __visc__sqrt(float) noexcept;
 float __visc__sin(float) noexcept;
 float __visc__cos(float) noexcept;
 // unsigned int
-//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned) noexcept;
-//unsigned __visc__atomic_add(unsigned*, unsigned) noexcept;
-//unsigned __visc__atomic_sub(unsigned*, unsigned) noexcept;
-//unsigned __visc__atomic_xchg(unsigned*, unsigned) noexcept;
-//unsigned __visc__atomic_inc(unsigned*) noexcept;
-//unsigned __visc__atomic_dec(unsigned*) noexcept;
-//unsigned __visc__atomic_min(unsigned*, unsigned) noexcept;
-//unsigned __visc__atomic_max(unsigned*, unsigned) noexcept;
-//unsigned __visc__atomic_and(unsigned*, unsigned) noexcept;
-//unsigned __visc__atomic_or(unsigned*, unsigned) noexcept;
-//unsigned __visc__atomic_xor(unsigned*, unsigned) noexcept;
-
+// unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned) noexcept;
+// unsigned __visc__atomic_add(unsigned*, unsigned) noexcept;
+// unsigned __visc__atomic_sub(unsigned*, unsigned) noexcept;
+// unsigned __visc__atomic_xchg(unsigned*, unsigned) noexcept;
+// unsigned __visc__atomic_inc(unsigned*) noexcept;
+// unsigned __visc__atomic_dec(unsigned*) noexcept;
+// unsigned __visc__atomic_min(unsigned*, unsigned) noexcept;
+// unsigned __visc__atomic_max(unsigned*, unsigned) noexcept;
+// unsigned __visc__atomic_and(unsigned*, unsigned) noexcept;
+// unsigned __visc__atomic_or(unsigned*, unsigned) noexcept;
+// unsigned __visc__atomic_xor(unsigned*, unsigned) noexcept;
 
 #include <unistd.h>
 
@@ -96,12 +96,10 @@ long get_group_id(int) noexcept;
 long get_local_id(int) noexcept;
 long get_local_size(int) noexcept;
 
-
-void llvm_visc_track_mem(void*, size_t) noexcept;
-void llvm_visc_untrack_mem(void*) noexcept;
-void llvm_visc_request_mem(void*, size_t) noexcept;
+void llvm_visc_track_mem(void *, size_t) noexcept;
+void llvm_visc_untrack_mem(void *) noexcept;
+void llvm_visc_request_mem(void *, size_t) noexcept;
 
 #ifdef __cplusplus
 }
 #endif
-
diff --git a/hpvm/test/parboil/common/src/parboil.c b/hpvm/test/parboil/common/src/parboil.c
index 2115271c46a4012889b45fcbffda404068850c2a..bd8f453abbd3af6311fd8df48ae40de8f1183025 100644
--- a/hpvm/test/parboil/common/src/parboil.c
+++ b/hpvm/test/parboil/common/src/parboil.c
@@ -3,41 +3,41 @@
  */
 
 #include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stdio.h>
 
 #if _POSIX_VERSION >= 200112L
 #include <time.h>
 #endif
 
-#define BILLION   1000000000LL
+#define BILLION 1000000000LL
 #define true 1
 
 /* Free an array of owned strings. */
-static void
-free_string_array(char **string_array)
-{
+static void free_string_array(char **string_array) {
   char **p;
 
-  if (!string_array) return;
-  for (p = string_array; *p; p++) free(*p);
+  if (!string_array)
+    return;
+  for (p = string_array; *p; p++)
+    free(*p);
   free(string_array);
 }
 
 /* Parse a comma-delimited list of strings into an
  * array of strings. */
-static char ** 
-read_string_array(char *in)
-{
+static char **read_string_array(char *in) {
   char **ret;
   int i;
-  int count;			/* Number of items in the input */
-  char *substring;		/* Current substring within 'in' */
+  int count;       /* Number of items in the input */
+  char *substring; /* Current substring within 'in' */
 
   /* Count the number of items in the string */
   count = 1;
-  for (i = 0; in[i]; i++) if (in[i] == ',') count++;
+  for (i = 0; in[i]; i++)
+    if (in[i] == ',')
+      count++;
 
   /* Allocate storage */
   ret = (char **)malloc((count + 1) * sizeof(char *));
@@ -50,8 +50,8 @@ read_string_array(char *in)
 
     /* Find length of substring */
     for (substring_end = substring;
-	 (*substring_end != ',') && (*substring_end != 0);
-	 substring_end++);
+         (*substring_end != ',') && (*substring_end != 0); substring_end++)
+      ;
 
     substring_length = substring_end - substring;
 
@@ -63,41 +63,35 @@ read_string_array(char *in)
     /* go to next substring */
     substring = substring_end + 1;
   }
-  ret[i] = NULL;		/* Write the sentinel value */
+  ret[i] = NULL; /* Write the sentinel value */
 
   return ret;
 }
 
 struct argparse {
-  int argc;			/* Number of arguments.  Mutable. */
-  char **argv;			/* Argument values.  Immutable. */
+  int argc;    /* Number of arguments.  Mutable. */
+  char **argv; /* Argument values.  Immutable. */
 
-  int argn;			/* Current argument number. */
-  char **argv_get;		/* Argument value being read. */
-  char **argv_put;		/* Argument value being written.
-				 * argv_put <= argv_get. */
+  int argn;        /* Current argument number. */
+  char **argv_get; /* Argument value being read. */
+  char **argv_put; /* Argument value being written.
+                    * argv_put <= argv_get. */
 };
 
-static void
-initialize_argparse(struct argparse *ap, int argc, char **argv)
-{
+static void initialize_argparse(struct argparse *ap, int argc, char **argv) {
   ap->argc = argc;
   ap->argn = 0;
   ap->argv_get = ap->argv_put = ap->argv = argv;
 }
 
-static void
-finalize_argparse(struct argparse *ap)
-{
+static void finalize_argparse(struct argparse *ap) {
   /* Move the remaining arguments */
-  for(; ap->argn < ap->argc; ap->argn++)
+  for (; ap->argn < ap->argc; ap->argn++)
     *ap->argv_put++ = *ap->argv_get++;
 }
 
 /* Delete the current argument. */
-static void
-delete_argument(struct argparse *ap)
-{
+static void delete_argument(struct argparse *ap) {
   if (ap->argn >= ap->argc) {
     fprintf(stderr, "delete_argument\n");
   }
@@ -107,9 +101,7 @@ delete_argument(struct argparse *ap)
 
 /* Go to the next argument.  Also, move the current argument to its
  * final location in argv. */
-static void
-next_argument(struct argparse *ap)
-{
+static void next_argument(struct argparse *ap) {
   if (ap->argn >= ap->argc) {
     fprintf(stderr, "next_argument\n");
   }
@@ -118,33 +110,23 @@ next_argument(struct argparse *ap)
   ap->argn++;
 }
 
-static int
-is_end_of_arguments(struct argparse *ap)
-{
+static int is_end_of_arguments(struct argparse *ap) {
   return ap->argn == ap->argc;
 }
 
-static char *
-get_argument(struct argparse *ap)
-{
-  return *ap->argv_get;
-}
+static char *get_argument(struct argparse *ap) { return *ap->argv_get; }
 
-static char *
-consume_argument(struct argparse *ap)
-{
+static char *consume_argument(struct argparse *ap) {
   char *ret = get_argument(ap);
   delete_argument(ap);
   return ret;
 }
 
-struct pb_Parameters *
-pb_ReadParameters(int *_argc, char **argv)
-{
+struct pb_Parameters *pb_ReadParameters(int *_argc, char **argv) {
   char *err_message;
   struct argparse ap;
   struct pb_Parameters *ret =
-    (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
+      (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
 
   /* Initialize the parameters structure */
   ret->outFile = NULL;
@@ -153,59 +135,54 @@ pb_ReadParameters(int *_argc, char **argv)
 
   /* Each argument */
   initialize_argparse(&ap, *_argc, argv);
-  while(!is_end_of_arguments(&ap)) {
+  while (!is_end_of_arguments(&ap)) {
     char *arg = get_argument(&ap);
 
     /* Single-character flag */
     if ((arg[0] == '-') && (arg[1] != 0) && (arg[2] == 0)) {
-      delete_argument(&ap);	/* This argument is consumed here */
-
-      switch(arg[1]) {
-      case 'o':			/* Output file name */
-	if (is_end_of_arguments(&ap))
-	  {
-	    err_message = "Expecting file name after '-o'\n";
-	    goto error;
-	  }
-	free(ret->outFile);
-	ret->outFile = strdup(consume_argument(&ap));
-	break;
-      case 'i':			/* Input file name */
-	if (is_end_of_arguments(&ap))
-	  {
-	    err_message = "Expecting file name after '-i'\n";
-	    goto error;
-	  }
-	ret->inpFiles = read_string_array(consume_argument(&ap));
-	break;
-      case '-':			/* End of options */
-	goto end_of_options;
+      delete_argument(&ap); /* This argument is consumed here */
+
+      switch (arg[1]) {
+      case 'o': /* Output file name */
+        if (is_end_of_arguments(&ap)) {
+          err_message = "Expecting file name after '-o'\n";
+          goto error;
+        }
+        free(ret->outFile);
+        ret->outFile = strdup(consume_argument(&ap));
+        break;
+      case 'i': /* Input file name */
+        if (is_end_of_arguments(&ap)) {
+          err_message = "Expecting file name after '-i'\n";
+          goto error;
+        }
+        ret->inpFiles = read_string_array(consume_argument(&ap));
+        break;
+      case '-': /* End of options */
+        goto end_of_options;
       default:
-	err_message = "Unexpected command-line parameter\n";
-	goto error;
+        err_message = "Unexpected command-line parameter\n";
+        goto error;
       }
-    }
-    else {
+    } else {
       /* Other parameters are ignored */
       next_argument(&ap);
     }
   } /* end for each argument */
 
- end_of_options:
-  *_argc = ap.argc;		/* Save the modified argc value */
+end_of_options:
+  *_argc = ap.argc; /* Save the modified argc value */
   finalize_argparse(&ap);
 
   return ret;
 
- error:
+error:
   fputs(err_message, stderr);
   pb_FreeParameters(ret);
   return NULL;
 }
 
-void
-pb_FreeParameters(struct pb_Parameters *p)
-{
+void pb_FreeParameters(struct pb_Parameters *p) {
   char **cpp;
 
   free(p->outFile);
@@ -213,56 +190,47 @@ pb_FreeParameters(struct pb_Parameters *p)
   free(p);
 }
 
-int
-pb_Parameters_CountInputs(struct pb_Parameters *p)
-{
+int pb_Parameters_CountInputs(struct pb_Parameters *p) {
   int n;
 
-  for (n = 0; p->inpFiles[n]; n++);
+  for (n = 0; p->inpFiles[n]; n++)
+    ;
   return n;
 }
 
 /*****************************************************************************/
 /* Timer routines */
 
-static void
-accumulate_time(pb_Timestamp *accum,
-		pb_Timestamp start,
-		pb_Timestamp end)
-{
+static void accumulate_time(pb_Timestamp *accum, pb_Timestamp start,
+                            pb_Timestamp end) {
 #if _POSIX_VERSION >= 200112L
   *accum += end - start;
 #else
-# error "Timestamps not implemented for this system"
+#error "Timestamps not implemented for this system"
 #endif
 }
 
 #if _POSIX_VERSION >= 200112L
-static pb_Timestamp get_time()
-{
+static pb_Timestamp get_time() {
   struct timespec tv;
   clock_gettime(CLOCK_MONOTONIC, &tv);
-  return (pb_Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec);
+  return (pb_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec);
 }
 #else
-# error "no supported time libraries are available on this platform"
+#error "no supported time libraries are available on this platform"
 #endif
 
-void
-pb_ResetTimer(struct pb_Timer *timer)
-{
+void pb_ResetTimer(struct pb_Timer *timer) {
   timer->state = pb_Timer_STOPPED;
 
 #if _POSIX_VERSION >= 200112L
   timer->elapsed = 0;
 #else
-# error "pb_ResetTimer: not implemented for this system"
+#error "pb_ResetTimer: not implemented for this system"
 #endif
 }
 
-void
-pb_StartTimer(struct pb_Timer *timer)
-{
+void pb_StartTimer(struct pb_Timer *timer) {
   if (timer->state != pb_Timer_STOPPED) {
     fputs("Ignoring attempt to start a running timer\n", stderr);
     return;
@@ -277,13 +245,12 @@ pb_StartTimer(struct pb_Timer *timer)
     timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-# error "pb_StartTimer: not implemented for this system"
+#error "pb_StartTimer: not implemented for this system"
 #endif
 }
 
-void
-pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
-{
+void pb_StartTimerAndSubTimer(struct pb_Timer *timer,
+                              struct pb_Timer *subtimer) {
   unsigned int numNotStopped = 0x3; // 11
   if (timer->state != pb_Timer_STOPPED) {
     fputs("Warning: Timer was not stopped\n", stderr);
@@ -305,24 +272,21 @@ pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
   {
     struct timespec tv;
     clock_gettime(CLOCK_MONOTONIC, &tv);
-    
+
     if (numNotStopped & 0x2) {
       timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
     }
-  
+
     if (numNotStopped & 0x1) {
       subtimer->init = tv.tv_sec * BILLION + tv.tv_nsec;
     }
   }
 #else
-# error "pb_StartTimer: not implemented for this system"
+#error "pb_StartTimer: not implemented for this system"
 #endif
-
 }
 
-void
-pb_StopTimer(struct pb_Timer *timer)
-{
+void pb_StopTimer(struct pb_Timer *timer) {
 
   pb_Timestamp fini;
 
@@ -340,15 +304,15 @@ pb_StopTimer(struct pb_Timer *timer)
     fini = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-# error "pb_StopTimer: not implemented for this system"
+#error "pb_StopTimer: not implemented for this system"
 #endif
 
   accumulate_time(&timer->elapsed, timer->init, fini);
   timer->init = fini;
-
 }
 
-void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) {
+void pb_StopTimerAndSubTimer(struct pb_Timer *timer,
+                             struct pb_Timer *subtimer) {
 
   pb_Timestamp fini;
 
@@ -366,7 +330,6 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
     return;
   }
 
-
   timer->state = pb_Timer_STOPPED;
   subtimer->state = pb_Timer_STOPPED;
 
@@ -377,25 +340,22 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
     fini = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-# error "pb_StopTimer: not implemented for this system"
+#error "pb_StopTimer: not implemented for this system"
 #endif
 
   if (numNotRunning & 0x2) {
     accumulate_time(&timer->elapsed, timer->init, fini);
     timer->init = fini;
   }
-  
+
   if (numNotRunning & 0x1) {
     accumulate_time(&subtimer->elapsed, subtimer->init, fini);
     subtimer->init = fini;
   }
-
 }
 
 /* Get the elapsed time in seconds. */
-double
-pb_GetElapsedTime(struct pb_Timer *timer)
-{
+double pb_GetElapsedTime(struct pb_Timer *timer) {
   double ret;
 
   if (timer->state != pb_Timer_STOPPED) {
@@ -405,22 +365,19 @@ pb_GetElapsedTime(struct pb_Timer *timer)
 #if _POSIX_VERSION >= 200112L
   ret = timer->elapsed / 1e9;
 #else
-# error "pb_GetElapsedTime: not implemented for this system"
+#error "pb_GetElapsedTime: not implemented for this system"
 #endif
   return ret;
 }
 
-void
-pb_InitializeTimerSet(struct pb_TimerSet *timers)
-{
+void pb_InitializeTimerSet(struct pb_TimerSet *timers) {
   int n;
-  
+
   timers->wall_begin = get_time();
 
   timers->current = pb_TimerID_NONE;
 
   timers->async_markers = NULL;
-  
 
   for (n = 0; n < pb_TimerID_LAST; n++) {
     pb_ResetTimer(&timers->timers[n]);
@@ -428,24 +385,24 @@ pb_InitializeTimerSet(struct pb_TimerSet *timers)
   }
 }
 
-void
-pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) {  
-  
-  struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc
-    (sizeof(struct pb_SubTimer));
-    
+void pb_AddSubTimer(struct pb_TimerSet *timers, char *label,
+                    enum pb_TimerID pb_Category) {
+
+  struct pb_SubTimer *subtimer =
+      (struct pb_SubTimer *)malloc(sizeof(struct pb_SubTimer));
+
   int len = strlen(label);
-    
-  subtimer->label = (char *) malloc (sizeof(char)*(len+1));
+
+  subtimer->label = (char *)malloc(sizeof(char) * (len + 1));
   sprintf(subtimer->label, "%s\0", label);
-  
+
   pb_ResetTimer(&subtimer->timer);
   subtimer->next = NULL;
-  
+
   struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category];
   if (subtimerlist == NULL) {
-    subtimerlist = (struct pb_SubTimerList *) malloc
-      (sizeof(struct pb_SubTimerList));
+    subtimerlist =
+        (struct pb_SubTimerList *)malloc(sizeof(struct pb_SubTimerList));
     subtimerlist->subtimer_list = subtimer;
     timers->sub_timer_list[pb_Category] = subtimerlist;
   } else {
@@ -456,28 +413,30 @@ pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Categ
     }
     element->next = subtimer;
   }
-  
 }
 
-void
-pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category)
-{
+void pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label,
+                         enum pb_TimerID category) {
+
+  // switchToSub( NULL, NONE
+  // switchToSub( NULL, some
+  // switchToSub( some, some
+  // switchToSub( some, NONE -- tries to find "some" in NONE's sublist, which
+  // won't be printed
 
-// switchToSub( NULL, NONE
-// switchToSub( NULL, some
-// switchToSub( some, some
-// switchToSub( some, NONE -- tries to find "some" in NONE's sublist, which won't be printed
-  
   struct pb_Timer *topLevelToStop = NULL;
   if (timers->current != category && timers->current != pb_TimerID_NONE) {
-    // Switching to subtimer in a different category needs to stop the top-level current, different categoried timer.
-    // NONE shouldn't have a timer associated with it, so exclude from branch
+    // Switching to a subtimer in a different category requires stopping the
+    // current top-level timer of the other category. NONE shouldn't have a
+    // timer associated with it, so it is excluded from this branch.
     topLevelToStop = &timers->timers[timers->current];
-  } 
+  }
+
+  struct pb_SubTimerList *subtimerlist =
+      timers->sub_timer_list[timers->current];
+  struct pb_SubTimer *curr =
+      (subtimerlist == NULL) ? NULL : subtimerlist->current;
 
-  struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
-  struct pb_SubTimer *curr = (subtimerlist == NULL) ? NULL : subtimerlist->current;
-  
   if (timers->current != pb_TimerID_NONE) {
     if (curr != NULL && topLevelToStop != NULL) {
       pb_StopTimerAndSubTimer(topLevelToStop, &curr->timer);
@@ -487,11 +446,11 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat
       pb_StopTimer(topLevelToStop);
     }
   }
-  
+
   subtimerlist = timers->sub_timer_list[category];
   struct pb_SubTimer *subtimer = NULL;
-  
-  if (label != NULL) {  
+
+  if (label != NULL) {
     subtimer = subtimerlist->subtimer_list;
     while (subtimer != NULL) {
       if (strcmp(subtimer->label, label) == 0) {
@@ -500,48 +459,47 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat
         subtimer = subtimer->next;
       }
     }
-  }  
-  
+  }
+
   if (category != pb_TimerID_NONE) {
-    
+
     if (subtimerlist != NULL) {
       subtimerlist->current = subtimer;
     }
-    
+
     if (category != timers->current && subtimer != NULL) {
       pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
     } else if (subtimer != NULL) {
       // Same category, different non-NULL subtimer
       pb_StartTimer(&subtimer->timer);
-    } else{
-      // Different category, but no subtimer (not found or specified as NULL) -- unprefered way of setting topLevel timer
+    } else {
+      // Different category, but no subtimer (not found or specified as NULL) --
+      // unpreferred way of setting topLevel timer
       pb_StartTimer(&timers->timers[category]);
     }
-  }  
-  
+  }
+
   timers->current = category;
-  
 }
 
-void
-pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
-{
-  if(timer == pb_TimerID_KERNEL)
+void pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) {
+  if (timer == pb_TimerID_KERNEL)
     printf("In parboil.c\n");
   /* Stop the currently running timer */
   if (timers->current != pb_TimerID_NONE) {
     struct pb_SubTimer *currSubTimer = NULL;
-    struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
-    
-    if ( subtimerlist != NULL) {
+    struct pb_SubTimerList *subtimerlist =
+        timers->sub_timer_list[timers->current];
+
+    if (subtimerlist != NULL) {
       currSubTimer = timers->sub_timer_list[timers->current]->current;
     }
-    if ( currSubTimer!= NULL) {
-      pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer);
+    if (currSubTimer != NULL) {
+      pb_StopTimerAndSubTimer(&timers->timers[timers->current],
+                              &currSubTimer->timer);
     } else {
       pb_StopTimer(&timers->timers[timers->current]);
     }
-    
   }
 
   timers->current = timer;
@@ -551,40 +509,39 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
   }
 }
 
-void
-pb_PrintTimerSet(struct pb_TimerSet *timers)
-{
+void pb_PrintTimerSet(struct pb_TimerSet *timers) {
   printf("Printing Parboil Timer: Default\n");
 
   pb_Timestamp wall_end = get_time();
 
   struct pb_Timer *t = timers->timers;
-  struct pb_SubTimer* sub = NULL;
-  
+  struct pb_SubTimer *sub = NULL;
+
   int maxSubLength;
-    
-//  const char *categories[] = {
-//    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute"
-//  };
 
-  const char *categories[] = {
-    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap",
-    "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free",
-    "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc",
-    "Pthread_Create", "Arg_Pack", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack"
-  };
+  //  const char *categories[] = {
+  //    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute"
+  //  };
 
+  const char *categories[] = {
+      "IO",          "Kernel",         "Copy",         "Driver",
+      "Copy Async",  "Compute",        "Overlap",      "Init_Ctx",
+      "Clear_Ctx",   "Copy_Scalar",    "Copy_Ptr",     "Mem_Free",
+      "Read_Output", "Setup",          "Mem_Track",    "Mem_Untrack",
+      "Misc",        "Pthread_Create", "Arg_Pack",     "Arg_Unpack",
+      "Computation", "Output_Pack",    "Output_Unpack"};
 
-  
   const int maxCategoryLength = 20;
-  
+
   int i;
-  for(i = 1; i < pb_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format
-    if(pb_GetElapsedTime(&t[i]) != 0 || true) {
-    
+  for (i = 1; i < pb_TimerID_LAST;
+       ++i) { // exclude NONE and OVERLAP from this format
+    if (pb_GetElapsedTime(&t[i]) != 0 || true) {
+
       // Print Category Timer
-      printf("%-*s: %.9f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i]));
-      
+      printf("%-*s: %.9f\n", maxCategoryLength, categories[i - 1],
+             pb_GetElapsedTime(&t[i]));
+
       if (timers->sub_timer_list[i] != NULL) {
         sub = timers->sub_timer_list[i]->subtimer_list;
         maxSubLength = 0;
@@ -595,44 +552,44 @@ pb_PrintTimerSet(struct pb_TimerSet *timers)
           }
           sub = sub->next;
         }
-        
+
         // Fit to Categories
         if (maxSubLength <= maxCategoryLength) {
-         maxSubLength = maxCategoryLength;
+          maxSubLength = maxCategoryLength;
         }
-        
+
         sub = timers->sub_timer_list[i]->subtimer_list;
-        
+
         // Print SubTimers
         while (sub != NULL) {
-          printf(" -%-*s: %.9f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer));
+          printf(" -%-*s: %.9f\n", maxSubLength, sub->label,
+                 pb_GetElapsedTime(&sub->timer));
           sub = sub->next;
         }
       }
     }
   }
-  
-  if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0)
-    printf("CPU/Kernel Overlap: %.9f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]));
-        
-  float walltime = (wall_end - timers->wall_begin)/ 1e9;
-  printf("Timer Wall Time: %.9f\n", walltime);  
-  
+
+  if (pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0)
+    printf("CPU/Kernel Overlap: %.9f\n",
+           pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]));
+
+  float walltime = (wall_end - timers->wall_begin) / 1e9;
+  printf("Timer Wall Time: %.9f\n", walltime);
 }
 
-void pb_DestroyTimerSet(struct pb_TimerSet * timers)
-{
+void pb_DestroyTimerSet(struct pb_TimerSet *timers) {
   /* clean up all of the async event markers */
-  struct pb_async_time_marker_list ** event = &(timers->async_markers);
-  while( *event != NULL) {
-    struct pb_async_time_marker_list ** next = &((*event)->next);
+  struct pb_async_time_marker_list **event = &(timers->async_markers);
+  while (*event != NULL) {
+    struct pb_async_time_marker_list **next = &((*event)->next);
     free(*event);
     (*event) = NULL;
     event = next;
   }
-  
+
   int i = 0;
-  for(i = 0; i < pb_TimerID_LAST; ++i) {    
+  for (i = 0; i < pb_TimerID_LAST; ++i) {
     if (timers->sub_timer_list[i] != NULL) {
       struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
       struct pb_SubTimer *prev = NULL;
@@ -646,5 +603,3 @@ void pb_DestroyTimerSet(struct pb_TimerSet * timers)
     }
   }
 }
-
-
diff --git a/hpvm/test/parboil/common/src/parboil_cuda.c b/hpvm/test/parboil/common/src/parboil_cuda.c
index d1bf554cc3219e20ce4bc0e76c6acfdd0091a9a7..9fd64661643c9afec5cb470beaa516d545017bd3 100644
--- a/hpvm/test/parboil/common/src/parboil_cuda.c
+++ b/hpvm/test/parboil/common/src/parboil_cuda.c
@@ -3,9 +3,9 @@
  */
 
 #include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stdio.h>
 #ifndef __MCUDA__
 #include <cuda_runtime_api.h>
 #else
@@ -13,35 +13,35 @@
 #endif
 
 #if _POSIX_VERSION >= 200112L
-# include <sys/time.h>
+#include <sys/time.h>
 #endif
 
 #define true 1
 
 /* Free an array of owned strings. */
-static void
-free_string_array(char **string_array)
-{
+static void free_string_array(char **string_array) {
   char **p;
 
-  if (!string_array) return;
-  for (p = string_array; *p; p++) free(*p);
+  if (!string_array)
+    return;
+  for (p = string_array; *p; p++)
+    free(*p);
   free(string_array);
 }
 
 /* Parse a comma-delimited list of strings into an
  * array of strings. */
-static char ** 
-read_string_array(char *in)
-{
+static char **read_string_array(char *in) {
   char **ret;
   int i;
-  int count;			/* Number of items in the input */
-  char *substring;		/* Current substring within 'in' */
+  int count;       /* Number of items in the input */
+  char *substring; /* Current substring within 'in' */
 
   /* Count the number of items in the string */
   count = 1;
-  for (i = 0; in[i]; i++) if (in[i] == ',') count++;
+  for (i = 0; in[i]; i++)
+    if (in[i] == ',')
+      count++;
 
   /* Allocate storage */
   ret = (char **)malloc((count + 1) * sizeof(char *));
@@ -54,8 +54,8 @@ read_string_array(char *in)
 
     /* Find length of substring */
     for (substring_end = substring;
-	 (*substring_end != ',') && (*substring_end != 0);
-	 substring_end++);
+         (*substring_end != ',') && (*substring_end != 0); substring_end++)
+      ;
 
     substring_length = substring_end - substring;
 
@@ -67,41 +67,35 @@ read_string_array(char *in)
     /* go to next substring */
     substring = substring_end + 1;
   }
-  ret[i] = NULL;		/* Write the sentinel value */
+  ret[i] = NULL; /* Write the sentinel value */
 
   return ret;
 }
 
 struct argparse {
-  int argc;			/* Number of arguments.  Mutable. */
-  char **argv;			/* Argument values.  Immutable. */
+  int argc;    /* Number of arguments.  Mutable. */
+  char **argv; /* Argument values.  Immutable. */
 
-  int argn;			/* Current argument number. */
-  char **argv_get;		/* Argument value being read. */
-  char **argv_put;		/* Argument value being written.
-				 * argv_put <= argv_get. */
+  int argn;        /* Current argument number. */
+  char **argv_get; /* Argument value being read. */
+  char **argv_put; /* Argument value being written.
+                    * argv_put <= argv_get. */
 };
 
-static void
-initialize_argparse(struct argparse *ap, int argc, char **argv)
-{
+static void initialize_argparse(struct argparse *ap, int argc, char **argv) {
   ap->argc = argc;
   ap->argn = 0;
   ap->argv_get = ap->argv_put = ap->argv = argv;
 }
 
-static void
-finalize_argparse(struct argparse *ap)
-{
+static void finalize_argparse(struct argparse *ap) {
   /* Move the remaining arguments */
-  for(; ap->argn < ap->argc; ap->argn++)
+  for (; ap->argn < ap->argc; ap->argn++)
     *ap->argv_put++ = *ap->argv_get++;
 }
 
 /* Delete the current argument. */
-static void
-delete_argument(struct argparse *ap)
-{
+static void delete_argument(struct argparse *ap) {
   if (ap->argn >= ap->argc) {
     fprintf(stderr, "delete_argument\n");
   }
@@ -111,9 +105,7 @@ delete_argument(struct argparse *ap)
 
 /* Go to the next argument.  Also, move the current argument to its
  * final location in argv. */
-static void
-next_argument(struct argparse *ap)
-{
+static void next_argument(struct argparse *ap) {
   if (ap->argn >= ap->argc) {
     fprintf(stderr, "next_argument\n");
   }
@@ -122,33 +114,23 @@ next_argument(struct argparse *ap)
   ap->argn++;
 }
 
-static int
-is_end_of_arguments(struct argparse *ap)
-{
+static int is_end_of_arguments(struct argparse *ap) {
   return ap->argn == ap->argc;
 }
 
-static char *
-get_argument(struct argparse *ap)
-{
-  return *ap->argv_get;
-}
+static char *get_argument(struct argparse *ap) { return *ap->argv_get; }
 
-static char *
-consume_argument(struct argparse *ap)
-{
+static char *consume_argument(struct argparse *ap) {
   char *ret = get_argument(ap);
   delete_argument(ap);
   return ret;
 }
 
-struct pb_Parameters *
-pb_ReadParameters(int *_argc, char **argv)
-{
+struct pb_Parameters *pb_ReadParameters(int *_argc, char **argv) {
   char *err_message;
   struct argparse ap;
   struct pb_Parameters *ret =
-    (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
+      (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
 
   /* Initialize the parameters structure */
   ret->outFile = NULL;
@@ -157,59 +139,54 @@ pb_ReadParameters(int *_argc, char **argv)
 
   /* Each argument */
   initialize_argparse(&ap, *_argc, argv);
-  while(!is_end_of_arguments(&ap)) {
+  while (!is_end_of_arguments(&ap)) {
     char *arg = get_argument(&ap);
 
     /* Single-character flag */
     if ((arg[0] == '-') && (arg[1] != 0) && (arg[2] == 0)) {
-      delete_argument(&ap);	/* This argument is consumed here */
-
-      switch(arg[1]) {
-      case 'o':			/* Output file name */
-	if (is_end_of_arguments(&ap))
-	  {
-	    err_message = "Expecting file name after '-o'\n";
-	    goto error;
-	  }
-	free(ret->outFile);
-	ret->outFile = strdup(consume_argument(&ap));
-	break;
-      case 'i':			/* Input file name */
-	if (is_end_of_arguments(&ap))
-	  {
-	    err_message = "Expecting file name after '-i'\n";
-	    goto error;
-	  }
-	ret->inpFiles = read_string_array(consume_argument(&ap));
-	break;
-      case '-':			/* End of options */
-	goto end_of_options;
+      delete_argument(&ap); /* This argument is consumed here */
+
+      switch (arg[1]) {
+      case 'o': /* Output file name */
+        if (is_end_of_arguments(&ap)) {
+          err_message = "Expecting file name after '-o'\n";
+          goto error;
+        }
+        free(ret->outFile);
+        ret->outFile = strdup(consume_argument(&ap));
+        break;
+      case 'i': /* Input file name */
+        if (is_end_of_arguments(&ap)) {
+          err_message = "Expecting file name after '-i'\n";
+          goto error;
+        }
+        ret->inpFiles = read_string_array(consume_argument(&ap));
+        break;
+      case '-': /* End of options */
+        goto end_of_options;
       default:
-	err_message = "Unexpected command-line parameter\n";
-	goto error;
+        err_message = "Unexpected command-line parameter\n";
+        goto error;
       }
-    }
-    else {
+    } else {
       /* Other parameters are ignored */
       next_argument(&ap);
     }
   } /* end for each argument */
 
- end_of_options:
-  *_argc = ap.argc;		/* Save the modified argc value */
+end_of_options:
+  *_argc = ap.argc; /* Save the modified argc value */
   finalize_argparse(&ap);
 
   return ret;
 
- error:
+error:
   fputs(err_message, stderr);
   pb_FreeParameters(ret);
   return NULL;
 }
 
-void
-pb_FreeParameters(struct pb_Parameters *p)
-{
+void pb_FreeParameters(struct pb_Parameters *p) {
   char **cpp;
 
   free(p->outFile);
@@ -217,61 +194,54 @@ pb_FreeParameters(struct pb_Parameters *p)
   free(p);
 }
 
-int
-pb_Parameters_CountInputs(struct pb_Parameters *p)
-{
+int pb_Parameters_CountInputs(struct pb_Parameters *p) {
   int n;
 
-  for (n = 0; p->inpFiles[n]; n++);
+  for (n = 0; p->inpFiles[n]; n++)
+    ;
   return n;
 }
 
 /*****************************************************************************/
 /* Timer routines */
 
-static int is_async(enum pb_TimerID timer)
-{
-  return (timer == pb_TimerID_KERNEL) || 
-             (timer == pb_TimerID_COPY_ASYNC);
+static int is_async(enum pb_TimerID timer) {
+  return (timer == pb_TimerID_KERNEL) || (timer == pb_TimerID_COPY_ASYNC);
 }
 
-static int is_blocking(enum pb_TimerID timer)
-{
+static int is_blocking(enum pb_TimerID timer) {
   return (timer == pb_TimerID_COPY) || (timer == pb_TimerID_NONE);
 }
 
 #define INVALID_TIMERID pb_TimerID_LAST
 
-static int asyncs_outstanding(struct pb_TimerSet* timers)
-{
-  return (timers->async_markers != NULL) && 
-           (timers->async_markers->timerID != INVALID_TIMERID);
+static int asyncs_outstanding(struct pb_TimerSet *timers) {
+  return (timers->async_markers != NULL) &&
+         (timers->async_markers->timerID != INVALID_TIMERID);
 }
 
-static struct pb_async_time_marker_list * 
-get_last_async(struct pb_TimerSet* timers)
-{
+static struct pb_async_time_marker_list *
+get_last_async(struct pb_TimerSet *timers) {
   /* Find the last event recorded thus far */
-  struct pb_async_time_marker_list * last_event = timers->async_markers;
-  if(last_event != NULL && last_event->timerID != INVALID_TIMERID) {
-    while(last_event->next != NULL && 
-            last_event->next->timerID != INVALID_TIMERID)
+  struct pb_async_time_marker_list *last_event = timers->async_markers;
+  if (last_event != NULL && last_event->timerID != INVALID_TIMERID) {
+    while (last_event->next != NULL &&
+           last_event->next->timerID != INVALID_TIMERID)
       last_event = last_event->next;
     return last_event;
   } else
     return NULL;
-} 
+}
 
-static void insert_marker(struct pb_TimerSet* tset, enum pb_TimerID timer)
-{
-  struct pb_async_time_marker_list ** new_event = &(tset->async_markers);
+static void insert_marker(struct pb_TimerSet *tset, enum pb_TimerID timer) {
+  struct pb_async_time_marker_list **new_event = &(tset->async_markers);
 
-  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID)
+  while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID)
     new_event = &((*new_event)->next);
 
-  if(*new_event == NULL) {
-    *new_event = (struct pb_async_time_marker_list *) 
-      			malloc(sizeof(struct pb_async_time_marker_list));
+  if (*new_event == NULL) {
+    *new_event = (struct pb_async_time_marker_list *)malloc(
+        sizeof(struct pb_async_time_marker_list));
     (*new_event)->marker = malloc(sizeof(cudaEvent_t));
     cudaEventCreate((*new_event)->marker);
     (*new_event)->next = NULL;
@@ -281,19 +251,18 @@ static void insert_marker(struct pb_TimerSet* tset, enum pb_TimerID timer)
   (*new_event)->label = NULL;
   (*new_event)->timerID = timer;
   cudaEventRecord(*((cudaEvent_t *)((*new_event)->marker)), 0);
-
 }
 
-static void insert_submarker(struct pb_TimerSet* tset, char *label, enum pb_TimerID timer)
-{
-  struct pb_async_time_marker_list ** new_event = &(tset->async_markers);
+static void insert_submarker(struct pb_TimerSet *tset, char *label,
+                             enum pb_TimerID timer) {
+  struct pb_async_time_marker_list **new_event = &(tset->async_markers);
 
-  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID)
+  while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID)
     new_event = &((*new_event)->next);
 
-  if(*new_event == NULL) {
-    *new_event = (struct pb_async_time_marker_list *) 
-      			malloc(sizeof(struct pb_async_time_marker_list));
+  if (*new_event == NULL) {
+    *new_event = (struct pb_async_time_marker_list *)malloc(
+        sizeof(struct pb_async_time_marker_list));
     (*new_event)->marker = malloc(sizeof(cudaEvent_t));
     cudaEventCreate((*new_event)->marker);
 
@@ -304,84 +273,73 @@ static void insert_submarker(struct pb_TimerSet* tset, char *label, enum pb_Time
   (*new_event)->label = label;
   (*new_event)->timerID = timer;
   cudaEventRecord(*((cudaEvent_t *)((*new_event)->marker)), 0);
-
 }
 
-
 /* Assumes that all recorded events have completed */
-static pb_Timestamp record_async_times(struct pb_TimerSet* tset)
-{
-  struct pb_async_time_marker_list * next_interval = NULL;
-  struct pb_async_time_marker_list * last_marker = get_last_async(tset);
+static pb_Timestamp record_async_times(struct pb_TimerSet *tset) {
+  struct pb_async_time_marker_list *next_interval = NULL;
+  struct pb_async_time_marker_list *last_marker = get_last_async(tset);
   pb_Timestamp total_async_time = 0;
   enum pb_TimerID timer;
-  for(next_interval = tset->async_markers; next_interval != last_marker; 
-      next_interval = next_interval->next) {
+  for (next_interval = tset->async_markers; next_interval != last_marker;
+       next_interval = next_interval->next) {
     float interval_time_ms;
-    cudaEventElapsedTime(&interval_time_ms, *((cudaEvent_t *)next_interval->marker), 
-                                         *((cudaEvent_t *)next_interval->next->marker));
-    pb_Timestamp interval = (pb_Timestamp) (interval_time_ms * 1e3);
+    cudaEventElapsedTime(&interval_time_ms,
+                         *((cudaEvent_t *)next_interval->marker),
+                         *((cudaEvent_t *)next_interval->next->marker));
+    pb_Timestamp interval = (pb_Timestamp)(interval_time_ms * 1e3);
     tset->timers[next_interval->timerID].elapsed += interval;
     if (next_interval->label != NULL) {
-      struct pb_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list;
+      struct pb_SubTimer *subtimer =
+          tset->sub_timer_list[next_interval->timerID]->subtimer_list;
       while (subtimer != NULL) {
-        if ( strcmp(subtimer->label, next_interval->label) == 0) {
+        if (strcmp(subtimer->label, next_interval->label) == 0) {
           subtimer->timer.elapsed += interval;
           break;
         }
         subtimer = subtimer->next;
-      }      
-    }        
+      }
+    }
     total_async_time += interval;
     next_interval->timerID = INVALID_TIMERID;
   }
 
-  if(next_interval != NULL)
+  if (next_interval != NULL)
     next_interval->timerID = INVALID_TIMERID;
-    
 
-  
   return total_async_time;
 }
 
-static void
-accumulate_time(pb_Timestamp *accum,
-		pb_Timestamp start,
-		pb_Timestamp end)
-{
+static void accumulate_time(pb_Timestamp *accum, pb_Timestamp start,
+                            pb_Timestamp end) {
 #if _POSIX_VERSION >= 200112L
   *accum += end - start;
 #else
-# error "Timestamps not implemented for this system"
+#error "Timestamps not implemented for this system"
 #endif
 }
 
 #if _POSIX_VERSION >= 200112L
-static pb_Timestamp get_time()
-{
+static pb_Timestamp get_time() {
   struct timeval tv;
   gettimeofday(&tv, NULL);
-  return (pb_Timestamp) (tv.tv_sec * 1000000LL + tv.tv_usec);
+  return (pb_Timestamp)(tv.tv_sec * 1000000LL + tv.tv_usec);
 }
 #else
-# error "no supported time libraries are available on this platform"
+#error "no supported time libraries are available on this platform"
 #endif
 
-void
-pb_ResetTimer(struct pb_Timer *timer)
-{
+void pb_ResetTimer(struct pb_Timer *timer) {
   timer->state = pb_Timer_STOPPED;
 
 #if _POSIX_VERSION >= 200112L
   timer->elapsed = 0;
 #else
-# error "pb_ResetTimer: not implemented for this system"
+#error "pb_ResetTimer: not implemented for this system"
 #endif
 }
 
-void
-pb_StartTimer(struct pb_Timer *timer)
-{
+void pb_StartTimer(struct pb_Timer *timer) {
   if (timer->state != pb_Timer_STOPPED) {
     fputs("Ignoring attempt to start a running timer\n", stderr);
     return;
@@ -396,13 +354,12 @@ pb_StartTimer(struct pb_Timer *timer)
     timer->init = tv.tv_sec * 1000000LL + tv.tv_usec;
   }
 #else
-# error "pb_StartTimer: not implemented for this system"
+#error "pb_StartTimer: not implemented for this system"
 #endif
 }
 
-void
-pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
-{
+void pb_StartTimerAndSubTimer(struct pb_Timer *timer,
+                              struct pb_Timer *subtimer) {
 
   unsigned int numNotStopped = 0x3; // 11
   if (timer->state != pb_Timer_STOPPED) {
@@ -425,24 +382,21 @@ pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
   {
     struct timeval tv;
     gettimeofday(&tv, NULL);
-    
+
     if (numNotStopped & 0x2) {
       timer->init = tv.tv_sec * 1000000LL + tv.tv_usec;
     }
-  
+
     if (numNotStopped & 0x1) {
       subtimer->init = tv.tv_sec * 1000000LL + tv.tv_usec;
     }
   }
 #else
-# error "pb_StartTimer: not implemented for this system"
+#error "pb_StartTimer: not implemented for this system"
 #endif
-
 }
 
-void
-pb_StopTimer(struct pb_Timer *timer)
-{
+void pb_StopTimer(struct pb_Timer *timer) {
   pb_Timestamp fini;
 
   if (timer->state != pb_Timer_RUNNING) {
@@ -459,14 +413,15 @@ pb_StopTimer(struct pb_Timer *timer)
     fini = tv.tv_sec * 1000000LL + tv.tv_usec;
   }
 #else
-# error "pb_StopTimer: not implemented for this system"
+#error "pb_StopTimer: not implemented for this system"
 #endif
 
   accumulate_time(&timer->elapsed, timer->init, fini);
   timer->init = fini;
 }
 
-void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) {
+void pb_StopTimerAndSubTimer(struct pb_Timer *timer,
+                             struct pb_Timer *subtimer) {
 
   pb_Timestamp fini;
 
@@ -484,7 +439,6 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
     return;
   }
 
-
   timer->state = pb_Timer_STOPPED;
   subtimer->state = pb_Timer_STOPPED;
 
@@ -495,25 +449,22 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
     fini = tv.tv_sec * 1000000LL + tv.tv_usec;
   }
 #else
-# error "pb_StopTimer: not implemented for this system"
+#error "pb_StopTimer: not implemented for this system"
 #endif
 
   if (numNotRunning & 0x2) {
     accumulate_time(&timer->elapsed, timer->init, fini);
     timer->init = fini;
   }
-  
+
   if (numNotRunning & 0x1) {
     accumulate_time(&subtimer->elapsed, subtimer->init, fini);
     subtimer->init = fini;
   }
-
 }
 
 /* Get the elapsed time in seconds. */
-double
-pb_GetElapsedTime(struct pb_Timer *timer)
-{
+double pb_GetElapsedTime(struct pb_Timer *timer) {
   double ret;
 
   if (timer->state != pb_Timer_STOPPED) {
@@ -523,14 +474,12 @@ pb_GetElapsedTime(struct pb_Timer *timer)
 #if _POSIX_VERSION >= 200112L
   ret = timer->elapsed / 1e6;
 #else
-# error "pb_GetElapsedTime: not implemented for this system"
+#error "pb_GetElapsedTime: not implemented for this system"
 #endif
   return ret;
 }
 
-void
-pb_InitializeTimerSet(struct pb_TimerSet *timers)
-{
+void pb_InitializeTimerSet(struct pb_TimerSet *timers) {
   int n;
 
   timers->wall_begin = get_time();
@@ -544,29 +493,26 @@ pb_InitializeTimerSet(struct pb_TimerSet *timers)
   }
 }
 
-void
-pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr) {
+void pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr) {}
 
-}
+void pb_AddSubTimer(struct pb_TimerSet *timers, char *label,
+                    enum pb_TimerID pb_Category) {
+
+  struct pb_SubTimer *subtimer =
+      (struct pb_SubTimer *)malloc(sizeof(struct pb_SubTimer));
 
-void
-pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) {  
-  
-  struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc
-    (sizeof(struct pb_SubTimer));
-    
   int len = strlen(label);
-    
-  subtimer->label = (char *) malloc (sizeof(char)*(len+1));
+
+  subtimer->label = (char *)malloc(sizeof(char) * (len + 1));
   sprintf(subtimer->label, "%s\0", label);
-  
+
   pb_ResetTimer(&subtimer->timer);
   subtimer->next = NULL;
-  
+
   struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category];
   if (subtimerlist == NULL) {
-    subtimerlist = (struct pb_SubTimerList *) malloc
-      (sizeof(struct pb_SubTimerList));
+    subtimerlist =
+        (struct pb_SubTimerList *)malloc(sizeof(struct pb_SubTimerList));
     subtimerlist->subtimer_list = subtimer;
     timers->sub_timer_list[pb_Category] = subtimerlist;
   } else {
@@ -577,21 +523,21 @@ pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Categ
     }
     element->next = subtimer;
   }
-  
 }
 
-void
-pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
-{
+void pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) {
   /* Stop the currently running timer */
   if (timers->current != pb_TimerID_NONE) {
-    struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
-    struct pb_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL;
-  
-    if (!is_async(timers->current) ) {
+    struct pb_SubTimerList *subtimerlist =
+        timers->sub_timer_list[timers->current];
+    struct pb_SubTimer *currSubTimer =
+        (subtimerlist != NULL) ? subtimerlist->current : NULL;
+
+    if (!is_async(timers->current)) {
       if (timers->current != timer) {
         if (currSubTimer != NULL) {
-          pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer);
+          pb_StopTimerAndSubTimer(&timers->timers[timers->current],
+                                  &currSubTimer->timer);
         } else {
           pb_StopTimer(&timers->timers[timers->current]);
         }
@@ -607,67 +553,68 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
       }
     }
   }
-  
+
   pb_Timestamp currentTime = get_time();
 
-  /* The only cases we check for asynchronous task completion is 
-   * when an overlapping CPU operation completes, or the next 
+  /* The only cases we check for asynchronous task completion is
+   * when an overlapping CPU operation completes, or the next
    * segment blocks on completion of previous async operations */
-  if( asyncs_outstanding(timers) && 
-      (!is_async(timers->current) || is_blocking(timer) ) ) {
+  if (asyncs_outstanding(timers) &&
+      (!is_async(timers->current) || is_blocking(timer))) {
 
-    struct pb_async_time_marker_list * last_event = get_last_async(timers);
+    struct pb_async_time_marker_list *last_event = get_last_async(timers);
     /* cudaSuccess if completed */
-    cudaError_t async_done = cudaEventQuery(*((cudaEvent_t *)last_event->marker));
+    cudaError_t async_done =
+        cudaEventQuery(*((cudaEvent_t *)last_event->marker));
 
-    if(is_blocking(timer)) {
-      /* Async operations completed after previous CPU operations: 
-       * overlapped time is the total CPU time since this set of async 
+    if (is_blocking(timer)) {
+      /* Async operations completed after previous CPU operations:
+       * overlapped time is the total CPU time since this set of async
        * operations were first issued */
-       
-      // timer to switch to is COPY or NONE 
-      if(async_done != cudaSuccess) 
-        accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), 
-	                  timers->async_begin,currentTime);
+
+      // timer to switch to is COPY or NONE
+      if (async_done != cudaSuccess)
+        accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed),
+                        timers->async_begin, currentTime);
 
       /* Wait on async operation completion */
       cudaEventSynchronize(*((cudaEvent_t *)last_event->marker));
       pb_Timestamp total_async_time = record_async_times(timers);
 
-      /* Async operations completed before previous CPU operations: 
+      /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-      if(async_done == cudaSuccess)
+      if (async_done == cudaSuccess)
         timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time;
 
-    } else 
-    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
-    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
-    // so something is deeper in stack
-    if(async_done == cudaSuccess) {
-      /* Async operations completed before previous CPU operations: 
+    } else
+        /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
+        // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are
+        // outstanding so something is deeper in stack
+        if (async_done == cudaSuccess) {
+      /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
       timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers);
-    }   
+    }
   }
 
   /* Start the new timer */
   if (timer != pb_TimerID_NONE) {
-    if(!is_async(timer)) {
+    if (!is_async(timer)) {
       pb_StartTimer(&timers->timers[timer]);
     } else {
       // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
       if (!asyncs_outstanding(timers)) {
         /* No asyncs outstanding, insert a fresh async marker */
-      
+
         insert_marker(timers, timer);
         timers->async_begin = currentTime;
-      } else if(!is_async(timers->current)) {
+      } else if (!is_async(timers->current)) {
         /* Previous asyncs still in flight, but a previous SwitchTo
-         * already marked the end of the most recent async operation, 
-         * so we can rename that marker as the beginning of this async 
+         * already marked the end of the most recent async operation,
+         * so we can rename that marker as the beginning of this async
          * operation */
-         
-        struct pb_async_time_marker_list * last_event = get_last_async(timers);
+
+        struct pb_async_time_marker_list *last_event = get_last_async(timers);
         last_event->label = NULL;
         last_event->timerID = timer;
       }
@@ -677,20 +624,21 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
     }
   }
   timers->current = timer;
-
 }
 
-void
-pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) 
-{
-  struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
-  struct pb_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL;
-  
+void pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label,
+                         enum pb_TimerID category) {
+  struct pb_SubTimerList *subtimerlist =
+      timers->sub_timer_list[timers->current];
+  struct pb_SubTimer *curr =
+      (subtimerlist != NULL) ? subtimerlist->current : NULL;
+
   if (timers->current != pb_TimerID_NONE) {
-    if (!is_async(timers->current) ) {
+    if (!is_async(timers->current)) {
       if (timers->current != category) {
         if (curr != NULL) {
-          pb_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer);
+          pb_StopTimerAndSubTimer(&timers->timers[timers->current],
+                                  &curr->timer);
         } else {
           pb_StopTimer(&timers->timers[timers->current]);
         }
@@ -709,56 +657,59 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat
 
   pb_Timestamp currentTime = get_time();
 
-  /* The only cases we check for asynchronous task completion is 
-   * when an overlapping CPU operation completes, or the next 
+  /* The only cases we check for asynchronous task completion is
+   * when an overlapping CPU operation completes, or the next
    * segment blocks on completion of previous async operations */
-  if( asyncs_outstanding(timers) && 
-      (!is_async(timers->current) || is_blocking(category) ) ) {
+  if (asyncs_outstanding(timers) &&
+      (!is_async(timers->current) || is_blocking(category))) {
 
-    struct pb_async_time_marker_list * last_event = get_last_async(timers);
+    struct pb_async_time_marker_list *last_event = get_last_async(timers);
     /* cudaSuccess if completed */
-    cudaError_t async_done = cudaEventQuery(*((cudaEvent_t *)last_event->marker));
+    cudaError_t async_done =
+        cudaEventQuery(*((cudaEvent_t *)last_event->marker));
 
-    if(is_blocking(category)) {
-      /* Async operations completed after previous CPU operations: 
-       * overlapped time is the total CPU time since this set of async 
+    if (is_blocking(category)) {
+      /* Async operations completed after previous CPU operations:
+       * overlapped time is the total CPU time since this set of async
        * operations were first issued */
-       
-      // timer to switch to is COPY or NONE 
-      // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP
-      // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization
-      // it seems that the extra sync wall time isn't being recorded anywhere
-      if(async_done != cudaSuccess) 
-        accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), 
-	                  timers->async_begin,currentTime);
+
+      // timer to switch to is COPY or NONE
+      // If it hasn't already finished, then just take now and use that as the
+      // elapsed time in OVERLAP. Anything happening after now isn't OVERLAP
+      // because everything is being stopped to wait for synchronization. It
+      // seems that the extra sync wall time isn't being recorded anywhere.
+      if (async_done != cudaSuccess)
+        accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed),
+                        timers->async_begin, currentTime);
 
       /* Wait on async operation completion */
       cudaEventSynchronize(*((cudaEvent_t *)last_event->marker));
       pb_Timestamp total_async_time = record_async_times(timers);
 
-      /* Async operations completed before previous CPU operations: 
+      /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-       // If it did finish, then accumulate all the async time that did happen into OVERLAP
-       // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed.
-      if(async_done == cudaSuccess)
+      // If it did finish, then accumulate all the async time that did happen
+      // into OVERLAP; the immediately preceding EventSynchronize theoretically
+      // didn't have any effect since it was already completed.
+      if (async_done == cudaSuccess)
         timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time;
 
-    } else 
-    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
-    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
-    // so something is deeper in stack
-    if(async_done == cudaSuccess) {
-      /* Async operations completed before previous CPU operations: 
+    } else
+        /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
+        // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are
+        // outstanding so something is deeper in stack
+        if (async_done == cudaSuccess) {
+      /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
       timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers);
-    }   
+    }
     // else, this isn't blocking, so just check the next time around
   }
-  
+
   subtimerlist = timers->sub_timer_list[category];
   struct pb_SubTimer *subtimer = NULL;
-  
-  if (label != NULL) {  
+
+  if (label != NULL) {
     subtimer = subtimerlist->subtimer_list;
     while (subtimer != NULL) {
       if (strcmp(subtimer->label, label) == 0) {
@@ -771,80 +722,81 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat
 
   /* Start the new timer */
   if (category != pb_TimerID_NONE) {
-    if(!is_async(category)) {
-    
+    if (!is_async(category)) {
+
       if (subtimerlist != NULL) {
         subtimerlist->current = subtimer;
       }
-    
+
       if (category != timers->current && subtimer != NULL) {
         pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
       } else if (subtimer != NULL) {
         pb_StartTimer(&subtimer->timer);
       } else {
         pb_StartTimer(&timers->timers[category]);
-      }            
+      }
     } else {
       if (subtimerlist != NULL) {
         subtimerlist->current = subtimer;
       }
-    
+
       // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
       if (!asyncs_outstanding(timers)) {
         /* No asyncs outstanding, insert a fresh async marker */
         insert_submarker(timers, label, category);
         timers->async_begin = currentTime;
-      } else if(!is_async(timers->current)) {
+      } else if (!is_async(timers->current)) {
         /* Previous asyncs still in flight, but a previous SwitchTo
-         * already marked the end of the most recent async operation, 
-         * so we can rename that marker as the beginning of this async 
+         * already marked the end of the most recent async operation,
+         * so we can rename that marker as the beginning of this async
          * operation */
-                  
-        struct pb_async_time_marker_list * last_event = get_last_async(timers);
+
+        struct pb_async_time_marker_list *last_event = get_last_async(timers);
         last_event->timerID = category;
         last_event->label = label;
       } // else, marker for switchToThis was already inserted
-      
-      //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running
+
+      // toSwitchto is already asynchronous, but if current/prev state is async
+      // too, then DRIVER is already running
       if (!is_async(timers->current)) {
         pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]);
       }
     }
   }
-  
-  timers->current = category;  
+
+  timers->current = category;
 }
 
-void
-pb_PrintTimerSet(struct pb_TimerSet *timers)
-{
+void pb_PrintTimerSet(struct pb_TimerSet *timers) {
   pb_Timestamp wall_end = get_time();
 
   struct pb_Timer *t = timers->timers;
-  struct pb_SubTimer* sub = NULL;
-  
+  struct pb_SubTimer *sub = NULL;
+
   int maxSubLength;
-    
-//  const char *categories[] = {
-//    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute"
-//  };
-  const char *categories[] = {
-    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap",
-    "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free",
-    "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc",
-    "Pthread_Create", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack"
-  };
 
+  //  const char *categories[] = {
+  //    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute"
+  //  };
+  const char *categories[] = {
+      "IO",          "Kernel",         "Copy",       "Driver",
+      "Copy Async",  "Compute",        "Overlap",    "Init_Ctx",
+      "Clear_Ctx",   "Copy_Scalar",    "Copy_Ptr",   "Mem_Free",
+      "Read_Output", "Setup",          "Mem_Track",  "Mem_Untrack",
+      "Misc",        "Pthread_Create", "Arg_Unpack", "Computation",
+      "Output_Pack", "Output_Unpack"};
 
   const int maxCategoryLength = 10;
-  
+
   int i;
-  for(i = 1; i < pb_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format
-    if(pb_GetElapsedTime(&t[i]) != 0 || true) {
-    
+  for (i = 1; i < pb_TimerID_LAST;
+       ++i) { // exclude NONE and OVERLAP from this format
+    if (pb_GetElapsedTime(&t[i]) != 0 || true) {
+
       // Print Category Timer
-      printf("%-*s: %f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i]));
-      
+      printf("%-*s: %f\n", maxCategoryLength, categories[i - 1],
+             pb_GetElapsedTime(&t[i]));
+
       if (timers->sub_timer_list[i] != NULL) {
         sub = timers->sub_timer_list[i]->subtimer_list;
         maxSubLength = 0;
@@ -855,47 +807,47 @@ pb_PrintTimerSet(struct pb_TimerSet *timers)
           }
           sub = sub->next;
         }
-        
+
         // Fit to Categories
         if (maxSubLength <= maxCategoryLength) {
-         maxSubLength = maxCategoryLength;
+          maxSubLength = maxCategoryLength;
         }
-        
+
         sub = timers->sub_timer_list[i]->subtimer_list;
-        
+
         // Print SubTimers
         while (sub != NULL) {
-          printf(" -%-*s: %f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer));
+          printf(" -%-*s: %f\n", maxSubLength, sub->label,
+                 pb_GetElapsedTime(&sub->timer));
           sub = sub->next;
         }
       }
     }
   }
-  
-  if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0)
-    printf("CPU/Kernel Overlap: %f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]));
-        
-  float walltime = (wall_end - timers->wall_begin)/ 1e6;
+
+  if (pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0)
+    printf("CPU/Kernel Overlap: %f\n",
+           pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]));
+
+  float walltime = (wall_end - timers->wall_begin) / 1e6;
   printf("Timer Wall Time: %f\n", walltime);
-  
 }
 
-void pb_DestroyTimerSet(struct pb_TimerSet * timers)
-{
+void pb_DestroyTimerSet(struct pb_TimerSet *timers) {
   /* clean up all of the async event markers */
-  struct pb_async_time_marker_list ** event = &(timers->async_markers);
-  while( *event != NULL) {
+  struct pb_async_time_marker_list **event = &(timers->async_markers);
+  while (*event != NULL) {
     cudaEventSynchronize(*((cudaEvent_t *)(*event)->marker));
     cudaEventDestroy(*((cudaEvent_t *)(*event)->marker));
     free((*event)->marker);
-    struct pb_async_time_marker_list ** next = &((*event)->next);
+    struct pb_async_time_marker_list **next = &((*event)->next);
     free(*event);
     (*event) = NULL;
     event = next;
   }
-  
+
   int i = 0;
-  for(i = 0; i < pb_TimerID_LAST; ++i) {    
+  for (i = 0; i < pb_TimerID_LAST; ++i) {
     if (timers->sub_timer_list[i] != NULL) {
       struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
       struct pb_SubTimer *prev = NULL;
@@ -909,5 +861,3 @@ void pb_DestroyTimerSet(struct pb_TimerSet * timers)
     }
   }
 }
-
-
diff --git a/hpvm/test/parboil/common/src/parboil_opencl.c b/hpvm/test/parboil/common/src/parboil_opencl.c
index 5f1937f356892489bd78ed5b2fb238d886de2f9a..d493992acee859186d58330a9988ef7ef2571f73 100644
--- a/hpvm/test/parboil/common/src/parboil_opencl.c
+++ b/hpvm/test/parboil/common/src/parboil_opencl.c
@@ -2,47 +2,47 @@
  * (c) 2007 The Board of Trustees of the University of Illinois.
  */
 
+#include <CL/cl.h>
+#include <assert.h>
 #include <parboil.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <stdio.h>
-#include <assert.h>
-#include <CL/cl.h>
 
 #if _POSIX_VERSION >= 200112L
 #include <time.h>
 #endif
 
-#define BILLION   1000000000LL
-#define true      1
+#define BILLION 1000000000LL
+#define true 1
 
 cl_context *clContextPtr;
 cl_command_queue *clCommandQueuePtr;
 
 /* Free an array of owned strings. */
-static void
-free_string_array(char **string_array)
-{
+static void free_string_array(char **string_array) {
   char **p;
 
-  if (!string_array) return;
-  for (p = string_array; *p; p++) free(*p);
+  if (!string_array)
+    return;
+  for (p = string_array; *p; p++)
+    free(*p);
   free(string_array);
 }
 
 /* Parse a comma-delimited list of strings into an
  * array of strings. */
-static char **
-read_string_array(char *in)
-{
+static char **read_string_array(char *in) {
   char **ret;
   int i;
-  int count;			/* Number of items in the input */
-  char *substring;		/* Current substring within 'in' */
+  int count;       /* Number of items in the input */
+  char *substring; /* Current substring within 'in' */
 
   /* Count the number of items in the string */
   count = 1;
-  for (i = 0; in[i]; i++) if (in[i] == ',') count++;
+  for (i = 0; in[i]; i++)
+    if (in[i] == ',')
+      count++;
 
   /* Allocate storage */
   ret = (char **)malloc((count + 1) * sizeof(char *));
@@ -55,8 +55,8 @@ read_string_array(char *in)
 
     /* Find length of substring */
     for (substring_end = substring;
-	 (*substring_end != ',') && (*substring_end != 0);
-	 substring_end++);
+         (*substring_end != ',') && (*substring_end != 0); substring_end++)
+      ;
 
     substring_length = substring_end - substring;
 
@@ -68,43 +68,37 @@ read_string_array(char *in)
     /* go to next substring */
     substring = substring_end + 1;
   }
-  ret[i] = NULL;		/* Write the sentinel value */
+  ret[i] = NULL; /* Write the sentinel value */
 
   return ret;
 }
 
 struct argparse {
-  int argc;			/* Number of arguments.  Mutable. */
-  char **argv;			/* Argument values.  Immutable. */
+  int argc;    /* Number of arguments.  Mutable. */
+  char **argv; /* Argument values.  Immutable. */
 
-  int argn;			/* Current argument number. */
-  char **argv_get;		/* Argument value being read. */
-  char **argv_put;		/* Argument value being written.
-				 * argv_put <= argv_get. */
+  int argn;        /* Current argument number. */
+  char **argv_get; /* Argument value being read. */
+  char **argv_put; /* Argument value being written.
+                    * argv_put <= argv_get. */
 };
 
-static void
-initialize_argparse(struct argparse *ap, int argc, char **argv)
-{
+static void initialize_argparse(struct argparse *ap, int argc, char **argv) {
   ap->argc = argc;
   ap->argn = 0;
   ap->argv_get = ap->argv_put = ap->argv = argv;
 }
 
-static void
-finalize_argparse(struct argparse *ap)
-{
+static void finalize_argparse(struct argparse *ap) {
   /* Move the remaining arguments */
-  for(; ap->argn < ap->argc; ap->argn++)
+  for (; ap->argn < ap->argc; ap->argn++)
     *ap->argv_put++ = *ap->argv_get++;
 }
 
 /* Delete the current argument. */
-static void
-delete_argument(struct argparse *ap)
-{
+static void delete_argument(struct argparse *ap) {
   if (ap->argn >= ap->argc) {
-    //fprintf(stderr, "delete_argument\n");
+    // fprintf(stderr, "delete_argument\n");
   }
   ap->argc--;
   ap->argv_get++;
@@ -112,44 +106,32 @@ delete_argument(struct argparse *ap)
 
 /* Go to the next argument.  Also, move the current argument to its
  * final location in argv. */
-static void
-next_argument(struct argparse *ap)
-{
+static void next_argument(struct argparse *ap) {
   if (ap->argn >= ap->argc) {
-    //fprintf(stderr, "next_argument\n");
+    // fprintf(stderr, "next_argument\n");
   }
   /* Move argument to its new location. */
   *ap->argv_put++ = *ap->argv_get++;
   ap->argn++;
 }
 
-static int
-is_end_of_arguments(struct argparse *ap)
-{
+static int is_end_of_arguments(struct argparse *ap) {
   return ap->argn == ap->argc;
 }
 
-static char *
-get_argument(struct argparse *ap)
-{
-  return *ap->argv_get;
-}
+static char *get_argument(struct argparse *ap) { return *ap->argv_get; }
 
-static char *
-consume_argument(struct argparse *ap)
-{
+static char *consume_argument(struct argparse *ap) {
   char *ret = get_argument(ap);
   delete_argument(ap);
   return ret;
 }
 
-struct pb_Parameters *
-pb_ReadParameters(int *_argc, char **argv)
-{
+struct pb_Parameters *pb_ReadParameters(int *_argc, char **argv) {
   char *err_message;
   struct argparse ap;
   struct pb_Parameters *ret =
-    (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
+      (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
 
   /* Initialize the parameters structure */
   ret->outFile = NULL;
@@ -158,59 +140,54 @@ pb_ReadParameters(int *_argc, char **argv)
 
   /* Each argument */
   initialize_argparse(&ap, *_argc, argv);
-  while(!is_end_of_arguments(&ap)) {
+  while (!is_end_of_arguments(&ap)) {
     char *arg = get_argument(&ap);
 
     /* Single-character flag */
     if ((arg[0] == '-') && (arg[1] != 0) && (arg[2] == 0)) {
-      delete_argument(&ap);	/* This argument is consumed here */
-
-      switch(arg[1]) {
-      case 'o':			/* Output file name */
-	if (is_end_of_arguments(&ap))
-	  {
-	    err_message = "Expecting file name after '-o'\n";
-	    goto error;
-	  }
-	free(ret->outFile);
-	ret->outFile = strdup(consume_argument(&ap));
-	break;
-      case 'i':			/* Input file name */
-	if (is_end_of_arguments(&ap))
-	  {
-	    err_message = "Expecting file name after '-i'\n";
-	    goto error;
-	  }
-	ret->inpFiles = read_string_array(consume_argument(&ap));
-	break;
-      case '-':			/* End of options */
-	goto end_of_options;
+      delete_argument(&ap); /* This argument is consumed here */
+
+      switch (arg[1]) {
+      case 'o': /* Output file name */
+        if (is_end_of_arguments(&ap)) {
+          err_message = "Expecting file name after '-o'\n";
+          goto error;
+        }
+        free(ret->outFile);
+        ret->outFile = strdup(consume_argument(&ap));
+        break;
+      case 'i': /* Input file name */
+        if (is_end_of_arguments(&ap)) {
+          err_message = "Expecting file name after '-i'\n";
+          goto error;
+        }
+        ret->inpFiles = read_string_array(consume_argument(&ap));
+        break;
+      case '-': /* End of options */
+        goto end_of_options;
       default:
-	err_message = "Unexpected command-line parameter\n";
-	goto error;
+        err_message = "Unexpected command-line parameter\n";
+        goto error;
       }
-    }
-    else {
+    } else {
       /* Other parameters are ignored */
       next_argument(&ap);
     }
   } /* end for each argument */
 
- end_of_options:
-  *_argc = ap.argc;		/* Save the modified argc value */
+end_of_options:
+  *_argc = ap.argc; /* Save the modified argc value */
   finalize_argparse(&ap);
 
   return ret;
 
- error:
+error:
   fputs(err_message, stderr);
   pb_FreeParameters(ret);
   return NULL;
 }
 
-void
-pb_FreeParameters(struct pb_Parameters *p)
-{
+void pb_FreeParameters(struct pb_Parameters *p) {
   char **cpp;
 
   free(p->outFile);
@@ -218,79 +195,72 @@ pb_FreeParameters(struct pb_Parameters *p)
   free(p);
 }
 
-int
-pb_Parameters_CountInputs(struct pb_Parameters *p)
-{
+int pb_Parameters_CountInputs(struct pb_Parameters *p) {
   int n;
 
-  for (n = 0; p->inpFiles[n]; n++);
+  for (n = 0; p->inpFiles[n]; n++)
+    ;
   return n;
 }
 
 /*****************************************************************************/
 /* Timer routines */
 
-static int is_async(enum pb_TimerID timer)
-{
+static int is_async(enum pb_TimerID timer) {
 #ifndef OPENCL_CPU
-  return (timer == pb_TimerID_KERNEL) ||
-             (timer == pb_TimerID_COPY_ASYNC);
+  return (timer == pb_TimerID_KERNEL) || (timer == pb_TimerID_COPY_ASYNC);
 #else
   return (timer == pb_TimerID_COPY_ASYNC);
 #endif
 }
 
-static int is_blocking(enum pb_TimerID timer)
-{
+static int is_blocking(enum pb_TimerID timer) {
   return (timer == pb_TimerID_COPY) || (timer == pb_TimerID_NONE);
 }
 
 #define INVALID_TIMERID pb_TimerID_LAST
 
-static int asyncs_outstanding(struct pb_TimerSet* timers)
-{
+static int asyncs_outstanding(struct pb_TimerSet *timers) {
   return (timers->async_markers != NULL) &&
-           (timers->async_markers->timerID != INVALID_TIMERID);
+         (timers->async_markers->timerID != INVALID_TIMERID);
 }
 
 static struct pb_async_time_marker_list *
-get_last_async(struct pb_TimerSet* timers)
-{
+get_last_async(struct pb_TimerSet *timers) {
   /* Find the last event recorded thus far */
-  struct pb_async_time_marker_list * last_event = timers->async_markers;
-  if(last_event != NULL && last_event->timerID != INVALID_TIMERID) {
-    while(last_event->next != NULL &&
-            last_event->next->timerID != INVALID_TIMERID)
+  struct pb_async_time_marker_list *last_event = timers->async_markers;
+  if (last_event != NULL && last_event->timerID != INVALID_TIMERID) {
+    while (last_event->next != NULL &&
+           last_event->next->timerID != INVALID_TIMERID)
       last_event = last_event->next;
     return last_event;
   } else
     return NULL;
 }
 
-static void insert_marker(struct pb_TimerSet* tset, enum pb_TimerID timer)
-{
+static void insert_marker(struct pb_TimerSet *tset, enum pb_TimerID timer) {
   cl_int ciErrNum = CL_SUCCESS;
-  struct pb_async_time_marker_list ** new_event = &(tset->async_markers);
+  struct pb_async_time_marker_list **new_event = &(tset->async_markers);
 
-  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
+  while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
     new_event = &((*new_event)->next);
   }
 
-  if(*new_event == NULL) {
-    *new_event = (struct pb_async_time_marker_list *)
-      			malloc(sizeof(struct pb_async_time_marker_list));
+  if (*new_event == NULL) {
+    *new_event = (struct pb_async_time_marker_list *)malloc(
+        sizeof(struct pb_async_time_marker_list));
     (*new_event)->marker = calloc(1, sizeof(cl_event));
     /*
-    // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event
-#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
-fprintf(stderr, "Creating Marker [%d]\n", timer);
-    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Creating User Event Object!\n");
+    // I don't think this is needed at all. I believe clEnqueueMarker 'creates'
+the event #if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) fprintf(stderr, "Creating
+Marker [%d]\n", timer);
+    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr,
+&ciErrNum); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Creating User
+Event Object!\n");
     }
-    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Setting User Event Status!\n");
+    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)),
+CL_QUEUED); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Setting User
+Event Status!\n");
     }
 #endif
 */
@@ -300,36 +270,36 @@ fprintf(stderr, "Creating Marker [%d]\n", timer);
   /* valid event handle now aquired: insert the event record */
   (*new_event)->label = NULL;
   (*new_event)->timerID = timer;
-  ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker);
+  ciErrNum =
+      clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker);
   if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Enqueueing Marker!\n");
+    fprintf(stderr, "Error Enqueueing Marker!\n");
   }
-
 }
 
-static void insert_submarker(struct pb_TimerSet* tset, char *label, enum pb_TimerID timer)
-{
+static void insert_submarker(struct pb_TimerSet *tset, char *label,
+                             enum pb_TimerID timer) {
   cl_int ciErrNum = CL_SUCCESS;
-  struct pb_async_time_marker_list ** new_event = &(tset->async_markers);
+  struct pb_async_time_marker_list **new_event = &(tset->async_markers);
 
-  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
+  while (*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
     new_event = &((*new_event)->next);
   }
 
-  if(*new_event == NULL) {
-    *new_event = (struct pb_async_time_marker_list *)
-      			malloc(sizeof(struct pb_async_time_marker_list));
+  if (*new_event == NULL) {
+    *new_event = (struct pb_async_time_marker_list *)malloc(
+        sizeof(struct pb_async_time_marker_list));
     (*new_event)->marker = calloc(1, sizeof(cl_event));
     /*
 #if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
 fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer);
-    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Creating User Event Object!\n");
+    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr,
+&ciErrNum); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Creating User
+Event Object!\n");
     }
-    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
-    if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Setting User Event Status!\n");
+    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)),
+CL_QUEUED); if (ciErrNum != CL_SUCCESS) { fprintf(stderr, "Error Setting User
+Event Status!\n");
     }
 #endif
 */
@@ -339,43 +309,48 @@ fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer);
   /* valid event handle now aquired: insert the event record */
   (*new_event)->label = label;
   (*new_event)->timerID = timer;
-  ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker);
+  ciErrNum =
+      clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker);
   if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error Enqueueing Marker!\n");
+    fprintf(stderr, "Error Enqueueing Marker!\n");
   }
-
 }
 
-
 /* Assumes that all recorded events have completed */
-static pb_Timestamp record_async_times(struct pb_TimerSet* tset)
-{
-  struct pb_async_time_marker_list * next_interval = NULL;
-  struct pb_async_time_marker_list * last_marker = get_last_async(tset);
+static pb_Timestamp record_async_times(struct pb_TimerSet *tset) {
+  struct pb_async_time_marker_list *next_interval = NULL;
+  struct pb_async_time_marker_list *last_marker = get_last_async(tset);
   pb_Timestamp total_async_time = 0;
   enum pb_TimerID timer;
 
-  for(next_interval = tset->async_markers; next_interval != last_marker;
-      next_interval = next_interval->next) {
-    cl_ulong command_start=0, command_end=0;
+  for (next_interval = tset->async_markers; next_interval != last_marker;
+       next_interval = next_interval->next) {
+    cl_ulong command_start = 0, command_end = 0;
     cl_int ciErrNum = CL_SUCCESS;
 
-    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL);
+    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker),
+                                       CL_PROFILING_COMMAND_END,
+                                       sizeof(cl_ulong), &command_start, NULL);
     if (ciErrNum != CL_SUCCESS) {
       fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum);
     }
 
-    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL);
+    ciErrNum = clGetEventProfilingInfo(
+        *((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END,
+        sizeof(cl_ulong), &command_end, NULL);
     if (ciErrNum != CL_SUCCESS) {
-      fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum);
+      fprintf(stderr, "Error getting second EventProfilingInfo: %d\n",
+              ciErrNum);
     }
 
-    pb_Timestamp interval = (pb_Timestamp) (((double)(command_end - command_start)));
+    pb_Timestamp interval =
+        (pb_Timestamp)(((double)(command_end - command_start)));
     tset->timers[next_interval->timerID].elapsed += interval;
     if (next_interval->label != NULL) {
-      struct pb_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list;
+      struct pb_SubTimer *subtimer =
+          tset->sub_timer_list[next_interval->timerID]->subtimer_list;
       while (subtimer != NULL) {
-        if ( strcmp(subtimer->label, next_interval->label) == 0) {
+        if (strcmp(subtimer->label, next_interval->label) == 0) {
           subtimer->timer.elapsed += interval;
           break;
         }
@@ -386,50 +361,42 @@ static pb_Timestamp record_async_times(struct pb_TimerSet* tset)
     next_interval->timerID = INVALID_TIMERID;
   }
 
-  if(next_interval != NULL)
+  if (next_interval != NULL)
     next_interval->timerID = INVALID_TIMERID;
 
   return total_async_time;
 }
 
-static void
-accumulate_time(pb_Timestamp *accum,
-		pb_Timestamp start,
-		pb_Timestamp end)
-{
+static void accumulate_time(pb_Timestamp *accum, pb_Timestamp start,
+                            pb_Timestamp end) {
 #if _POSIX_VERSION >= 200112L
   *accum += end - start;
 #else
-# error "Timestamps not implemented for this system"
+#error "Timestamps not implemented for this system"
 #endif
 }
 
 #if _POSIX_VERSION >= 200112L
-static pb_Timestamp get_time()
-{
+static pb_Timestamp get_time() {
   struct timespec tv;
   clock_gettime(CLOCK_MONOTONIC, &tv);
-  return (pb_Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec);
+  return (pb_Timestamp)(tv.tv_sec * BILLION + tv.tv_nsec);
 }
 #else
-# error "no supported time libraries are available on this platform"
+#error "no supported time libraries are available on this platform"
 #endif
 
-void
-pb_ResetTimer(struct pb_Timer *timer)
-{
+void pb_ResetTimer(struct pb_Timer *timer) {
   timer->state = pb_Timer_STOPPED;
 
 #if _POSIX_VERSION >= 200112L
   timer->elapsed = 0;
 #else
-# error "pb_ResetTimer: not implemented for this system"
+#error "pb_ResetTimer: not implemented for this system"
 #endif
 }
 
-void
-pb_StartTimer(struct pb_Timer *timer)
-{
+void pb_StartTimer(struct pb_Timer *timer) {
   if (timer->state != pb_Timer_STOPPED) {
     fputs("Ignoring attempt to start a running timer\n", stderr);
     return;
@@ -444,13 +411,12 @@ pb_StartTimer(struct pb_Timer *timer)
     timer->init = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-# error "pb_StartTimer: not implemented for this system"
+#error "pb_StartTimer: not implemented for this system"
 #endif
 }
 
-void
-pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
-{
+void pb_StartTimerAndSubTimer(struct pb_Timer *timer,
+                              struct pb_Timer *subtimer) {
 
   unsigned int numNotStopped = 0x3; // 11
   if (timer->state != pb_Timer_STOPPED) {
@@ -483,14 +449,11 @@ pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
     }
   }
 #else
-# error "pb_StartTimer: not implemented for this system"
+#error "pb_StartTimer: not implemented for this system"
 #endif
-
 }
 
-void
-pb_StopTimer(struct pb_Timer *timer)
-{
+void pb_StopTimer(struct pb_Timer *timer) {
   pb_Timestamp fini;
 
   if (timer->state != pb_Timer_RUNNING) {
@@ -507,14 +470,15 @@ pb_StopTimer(struct pb_Timer *timer)
     fini = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-# error "pb_StopTimer: not implemented for this system"
+#error "pb_StopTimer: not implemented for this system"
 #endif
 
   accumulate_time(&timer->elapsed, timer->init, fini);
   timer->init = fini;
 }
 
-void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) {
+void pb_StopTimerAndSubTimer(struct pb_Timer *timer,
+                             struct pb_Timer *subtimer) {
 
   pb_Timestamp fini;
 
@@ -532,7 +496,6 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
     return;
   }
 
-
   timer->state = pb_Timer_STOPPED;
   subtimer->state = pb_Timer_STOPPED;
 
@@ -543,7 +506,7 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
     fini = tv.tv_sec * BILLION + tv.tv_nsec;
   }
 #else
-# error "pb_StopTimer: not implemented for this system"
+#error "pb_StopTimer: not implemented for this system"
 #endif
 
   if (numNotRunning & 0x2) {
@@ -555,13 +518,10 @@ void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
     accumulate_time(&subtimer->elapsed, subtimer->init, fini);
     subtimer->init = fini;
   }
-
 }
 
 /* Get the elapsed time in seconds. */
-double
-pb_GetElapsedTime(struct pb_Timer *timer)
-{
+double pb_GetElapsedTime(struct pb_Timer *timer) {
   double ret;
 
   if (timer->state != pb_Timer_STOPPED) {
@@ -571,14 +531,12 @@ pb_GetElapsedTime(struct pb_Timer *timer)
 #if _POSIX_VERSION >= 200112L
   ret = timer->elapsed / 1e9;
 #else
-# error "pb_GetElapsedTime: not implemented for this system"
+#error "pb_GetElapsedTime: not implemented for this system"
 #endif
   return ret;
 }
 
-void
-pb_InitializeTimerSet(struct pb_TimerSet *timers)
-{
+void pb_InitializeTimerSet(struct pb_TimerSet *timers) {
   int n;
 
   timers->wall_begin = get_time();
@@ -597,16 +555,14 @@ void pb_SetOpenCL(void *p_clContextPtr, void *p_clCommandQueuePtr) {
   clCommandQueuePtr = ((cl_command_queue *)p_clCommandQueuePtr);
 }
 
-static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
-{
+static char *LoadProgSource(const char *Filename, size_t *szFinalLength) {
   // locals
-  FILE* pFileStream = NULL;
+  FILE *pFileStream = NULL;
   size_t szSourceLength;
 
   // open the OpenCL source code file
   pFileStream = fopen(Filename, "rb");
-  if(pFileStream == 0)
-  {
+  if (pFileStream == 0) {
     return NULL;
   }
 
@@ -616,60 +572,62 @@ static char* LoadProgSource(const char* Filename, size_t* szFinalLength)
   fseek(pFileStream, 0, SEEK_SET);
 
   // allocate a buffer for the source code string and read it in
-  char* cSourceString = (char *)malloc(szSourceLength + 1);
-  if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
-  {
-      fclose(pFileStream);
-      free(cSourceString);
-      return 0;
+  char *cSourceString = (char *)malloc(szSourceLength + 1);
+  if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) {
+    fclose(pFileStream);
+    free(cSourceString);
+    return 0;
   }
 
-  // close the file and return the total length of the combined (preamble + source) string
+  // close the file and return the total length of the combined (preamble +
+  // source) string
   fclose(pFileStream);
-  if(szFinalLength != 0)
-  {
-      *szFinalLength = szSourceLength;
+  if (szFinalLength != 0) {
+    *szFinalLength = szSourceLength;
   }
   cSourceString[szSourceLength] = '\0';
 
   return cSourceString;
 }
 
-static inline void checkErr(cl_int err, cl_int success, const char * name) {
+static inline void checkErr(cl_int err, cl_int success, const char *name) {
   if (err != success) {
     printf("ERROR: %s\n", name);
     exit(EXIT_FAILURE);
   }
 }
 
-void pb_CreateAndBuildKernelFromBinary(const char* file, const char* kernel, void* clContextPtr, void* clDevicePtr, void* clProgramPtr, void* clKernelPtr) {
+void pb_CreateAndBuildKernelFromBinary(const char *file, const char *kernel,
+                                       void *clContextPtr, void *clDevicePtr,
+                                       void *clProgramPtr, void *clKernelPtr) {
   size_t kernelLength;
   char *programSource = LoadProgSource(file, &kernelLength);
-  checkErr(programSource != NULL, 1 /*bool true*/, "Failure to load Program Binary");
+  checkErr(programSource != NULL, 1 /*bool true*/,
+           "Failure to load Program Binary");
 
   cl_int binaryStatus;
   cl_int errcode;
-  cl_device_id clDevice = *(cl_device_id*) clDevicePtr;
-  cl_context clContext = *(cl_context*) clContextPtr;
-  cl_program clProgram = clCreateProgramWithBinary(clContext, 1, &clDevice,
-                                        &kernelLength,
-                                        (const unsigned char **)&programSource,
-                                        &binaryStatus, &errcode);
+  cl_device_id clDevice = *(cl_device_id *)clDevicePtr;
+  cl_context clContext = *(cl_context *)clContextPtr;
+  cl_program clProgram = clCreateProgramWithBinary(
+      clContext, 1, &clDevice, &kernelLength,
+      (const unsigned char **)&programSource, &binaryStatus, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
 
   // printf("Building kernel - %s, from file %s\n", kernel, file);
   errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
   // If build fails, get build log from device
-  if(errcode != CL_SUCCESS) {
+  if (errcode != CL_SUCCESS) {
     printf("ERROR: Failure to build program\n");
     size_t len = 0;
-    errcode = clGetProgramBuildInfo(clProgram, clDevice , CL_PROGRAM_BUILD_LOG, 0,
-        NULL, &len);
+    errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
+                                    0, NULL, &len);
     printf("LOG LENGTH: %lu\n", len);
-    checkErr(errcode, CL_SUCCESS, "Failure to collect program build log length");
-    char *log = (char*) malloc(len*sizeof(char));
-    errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, len,
-        log, NULL);
+    checkErr(errcode, CL_SUCCESS,
+             "Failure to collect program build log length");
+    char *log = (char *)malloc(len * sizeof(char));
+    errcode = clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
+                                    len, log, NULL);
     checkErr(errcode, CL_SUCCESS, "Failure to collect program build log");
 
     printf("Device Build Log: %s\n", log);
@@ -679,22 +637,22 @@ void pb_CreateAndBuildKernelFromBinary(const char* file, const char* kernel, voi
 
   cl_kernel clKernel = clCreateKernel(clProgram, kernel, &errcode);
   checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
-  
-  *(cl_program*) clProgramPtr = clProgram;
-  *(cl_kernel*)clKernelPtr = clKernel;
+
+  *(cl_program *)clProgramPtr = clProgram;
+  *(cl_kernel *)clKernelPtr = clKernel;
 
   free(programSource);
 }
 
-void
-pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) {
+void pb_AddSubTimer(struct pb_TimerSet *timers, char *label,
+                    enum pb_TimerID pb_Category) {
 
-  struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc
-    (sizeof(struct pb_SubTimer));
+  struct pb_SubTimer *subtimer =
+      (struct pb_SubTimer *)malloc(sizeof(struct pb_SubTimer));
 
   int len = strlen(label);
 
-  subtimer->label = (char *) malloc (sizeof(char)*(len+1));
+  subtimer->label = (char *)malloc(sizeof(char) * (len + 1));
   sprintf(subtimer->label, "%s\0", label);
 
   pb_ResetTimer(&subtimer->timer);
@@ -702,8 +660,8 @@ pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Categ
 
   struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category];
   if (subtimerlist == NULL) {
-    subtimerlist = (struct pb_SubTimerList *) calloc
-      (1, sizeof(struct pb_SubTimerList));
+    subtimerlist =
+        (struct pb_SubTimerList *)calloc(1, sizeof(struct pb_SubTimerList));
     subtimerlist->subtimer_list = subtimer;
     timers->sub_timer_list[pb_Category] = subtimerlist;
   } else {
@@ -714,21 +672,21 @@ pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Categ
     }
     element->next = subtimer;
   }
-
 }
 
-void
-pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
-{
+void pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) {
   /* Stop the currently running timer */
   if (timers->current != pb_TimerID_NONE) {
-    struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
-    struct pb_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL;
+    struct pb_SubTimerList *subtimerlist =
+        timers->sub_timer_list[timers->current];
+    struct pb_SubTimer *currSubTimer =
+        (subtimerlist != NULL) ? subtimerlist->current : NULL;
 
-    if (!is_async(timers->current) ) {
+    if (!is_async(timers->current)) {
       if (timers->current != timer) {
         if (currSubTimer != NULL) {
-          pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer);
+          pb_StopTimerAndSubTimer(&timers->timers[timers->current],
+                                  &currSubTimer->timer);
         } else {
           pb_StopTimer(&timers->timers[timers->current]);
         }
@@ -750,30 +708,31 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
   /* The only cases we check for asynchronous task completion is
    * when an overlapping CPU operation completes, or the next
    * segment blocks on completion of previous async operations */
-  if( asyncs_outstanding(timers) &&
-      (!is_async(timers->current) || is_blocking(timer) ) ) {
+  if (asyncs_outstanding(timers) &&
+      (!is_async(timers->current) || is_blocking(timer))) {
 
-    struct pb_async_time_marker_list * last_event = get_last_async(timers);
+    struct pb_async_time_marker_list *last_event = get_last_async(timers);
     /* CL_COMPLETE if completed */
 
     cl_int ciErrNum = CL_SUCCESS;
     cl_int async_done = CL_COMPLETE;
 
-    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
+    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker),
+                              CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int),
+                              &async_done, NULL);
     if (ciErrNum != CL_SUCCESS) {
       fprintf(stderr, "Error Querying EventInfo!\n");
     }
 
-
-    if(is_blocking(timer)) {
+    if (is_blocking(timer)) {
       /* Async operations completed after previous CPU operations:
        * overlapped time is the total CPU time since this set of async
        * operations were first issued */
 
       // timer to switch to is COPY or NONE
-      if(async_done != CL_COMPLETE) {
+      if (async_done != CL_COMPLETE) {
         accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed),
-	                  timers->async_begin,currentTime);
+                        timers->async_begin, currentTime);
       }
 
       /* Wait on async operation completion */
@@ -786,16 +745,17 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
 
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-      if(async_done == CL_COMPLETE) {
-        //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time);
+      if (async_done == CL_COMPLETE) {
+        // fprintf(stderr, "Async_done: total_async_type = %lld\n",
+        // total_async_time);
         timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time;
       }
 
     } else
-    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
-    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
-    // so something is deeper in stack
-    if(async_done == CL_COMPLETE ) {
+        /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
+        // i.e. current is not async (not KERNEL/COPY_ASYNC) but there are
+        // outstanding asyncs, so something is deeper in the stack
+        if (async_done == CL_COMPLETE) {
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
       timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers);
@@ -804,7 +764,7 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
 
   /* Start the new timer */
   if (timer != pb_TimerID_NONE) {
-    if(!is_async(timer)) {
+    if (!is_async(timer)) {
       pb_StartTimer(&timers->timers[timer]);
     } else {
       // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
@@ -813,13 +773,13 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
 
         insert_marker(timers, timer);
         timers->async_begin = currentTime;
-      } else if(!is_async(timers->current)) {
+      } else if (!is_async(timers->current)) {
         /* Previous asyncs still in flight, but a previous SwitchTo
          * already marked the end of the most recent async operation,
          * so we can rename that marker as the beginning of this async
          * operation */
 
-        struct pb_async_time_marker_list * last_event = get_last_async(timers);
+        struct pb_async_time_marker_list *last_event = get_last_async(timers);
         last_event->label = NULL;
         last_event->timerID = timer;
       }
@@ -829,20 +789,21 @@ pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
     }
   }
   timers->current = timer;
-
 }
 
-void
-pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category)
-{
-  struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
-  struct pb_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL;
+void pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label,
+                         enum pb_TimerID category) {
+  struct pb_SubTimerList *subtimerlist =
+      timers->sub_timer_list[timers->current];
+  struct pb_SubTimer *curr =
+      (subtimerlist != NULL) ? subtimerlist->current : NULL;
 
   if (timers->current != pb_TimerID_NONE) {
-    if (!is_async(timers->current) ) {
+    if (!is_async(timers->current)) {
       if (timers->current != category) {
         if (curr != NULL) {
-          pb_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer);
+          pb_StopTimerAndSubTimer(&timers->timers[timers->current],
+                                  &curr->timer);
         } else {
           pb_StopTimer(&timers->timers[timers->current]);
         }
@@ -864,32 +825,35 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat
   /* The only cases we check for asynchronous task completion is
    * when an overlapping CPU operation completes, or the next
    * segment blocks on completion of previous async operations */
-  if( asyncs_outstanding(timers) &&
-      (!is_async(timers->current) || is_blocking(category) ) ) {
+  if (asyncs_outstanding(timers) &&
+      (!is_async(timers->current) || is_blocking(category))) {
 
-    struct pb_async_time_marker_list * last_event = get_last_async(timers);
+    struct pb_async_time_marker_list *last_event = get_last_async(timers);
     /* CL_COMPLETE if completed */
 
     cl_int ciErrNum = CL_SUCCESS;
     cl_int async_done = CL_COMPLETE;
 
-    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
+    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker),
+                              CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int),
+                              &async_done, NULL);
     if (ciErrNum != CL_SUCCESS) {
       fprintf(stderr, "Error Querying EventInfo!\n");
     }
 
-    if(is_blocking(category)) {
+    if (is_blocking(category)) {
       /* Async operations completed after previous CPU operations:
        * overlapped time is the total CPU time since this set of async
        * operations were first issued */
 
       // timer to switch to is COPY or NONE
-      // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP
-      // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization
-      // it seems that the extra sync wall time isn't being recorded anywhere
-      if(async_done != CL_COMPLETE)
+      // If it hasn't already finished, then just take now and use that as the
+      // elapsed time in OVERLAP. Anything happening after now isn't OVERLAP,
+      // because everything is being stopped to wait for synchronization.
+      // It seems that the extra sync wall time isn't being recorded anywhere.
+      if (async_done != CL_COMPLETE)
         accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed),
-	                  timers->async_begin,currentTime);
+                        timers->async_begin, currentTime);
 
       /* Wait on async operation completion */
       ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
@@ -900,16 +864,17 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat
 
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
-       // If it did finish, then accumulate all the async time that did happen into OVERLAP
-       // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed.
-      if(async_done == CL_COMPLETE /*cudaSuccess*/)
+      // If it did finish, then accumulate all the async time that did happen
+      // into OVERLAP the immediately preceding EventSynchronize theoretically
+      // didn't have any effect since it was already completed.
+      if (async_done == CL_COMPLETE /*cudaSuccess*/)
         timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time;
 
     } else
-    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
-    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
-    // so something is deeper in stack
-    if(async_done == CL_COMPLETE /*cudaSuccess*/) {
+        /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
+        // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are
+        // outstanding so something is deeper in stack
+        if (async_done == CL_COMPLETE /*cudaSuccess*/) {
       /* Async operations completed before previous CPU operations:
        * overlapped time is the total async time */
       timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers);
@@ -933,7 +898,7 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat
 
   /* Start the new timer */
   if (category != pb_TimerID_NONE) {
-    if(!is_async(category)) {
+    if (!is_async(category)) {
       if (subtimerlist != NULL) {
         subtimerlist->current = subtimer;
       }
@@ -955,18 +920,19 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat
         /* No asyncs outstanding, insert a fresh async marker */
         insert_submarker(timers, label, category);
         timers->async_begin = currentTime;
-      } else if(!is_async(timers->current)) {
+      } else if (!is_async(timers->current)) {
         /* Previous asyncs still in flight, but a previous SwitchTo
          * already marked the end of the most recent async operation,
          * so we can rename that marker as the beginning of this async
          * operation */
 
-        struct pb_async_time_marker_list * last_event = get_last_async(timers);
+        struct pb_async_time_marker_list *last_event = get_last_async(timers);
         last_event->timerID = category;
         last_event->label = label;
       } // else, marker for switchToThis was already inserted
 
-      //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running
+      // toSwitchto is already asynchronous, but if current/prev state is async
+      // too, then DRIVER is already running
       if (!is_async(timers->current)) {
         pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]);
       }
@@ -976,36 +942,36 @@ pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID cat
   timers->current = category;
 }
 
-void
-pb_PrintTimerSet(struct pb_TimerSet *timers)
-{
+void pb_PrintTimerSet(struct pb_TimerSet *timers) {
   printf("Printing Parboil Timer: Default\n");
   pb_Timestamp wall_end = get_time();
 
   struct pb_Timer *t = timers->timers;
-  struct pb_SubTimer* sub = NULL;
+  struct pb_SubTimer *sub = NULL;
 
   int maxSubLength;
 
-//  const char *categories[] = {
-//    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute"
-//  };
+  //  const char *categories[] = {
+  //    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute"
+  //  };
   const char *categories[] = {
-    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute", "Overlap",
-    "Init_Ctx", "Clear_Ctx", "Copy_Scalar", "Copy_Ptr", "Mem_Free",
-    "Read_Output", "Setup", "Mem_Track", "Mem_Untrack", "Misc",
-    "Pthread_Create", "Arg_Pack", "Arg_Unpack", "Computation", "Output_Pack", "Output_Unpack"
-  };
-
+      "IO",          "Kernel",         "Copy",         "Driver",
+      "Copy Async",  "Compute",        "Overlap",      "Init_Ctx",
+      "Clear_Ctx",   "Copy_Scalar",    "Copy_Ptr",     "Mem_Free",
+      "Read_Output", "Setup",          "Mem_Track",    "Mem_Untrack",
+      "Misc",        "Pthread_Create", "Arg_Pack",     "Arg_Unpack",
+      "Computation", "Output_Pack",    "Output_Unpack"};
 
   const int maxCategoryLength = 20;
 
   int i;
-  for(i = 1; i < pb_TimerID_LAST; ++i) { // exclude NONE and OVRELAP from this format
-    if(pb_GetElapsedTime(&t[i]) != 0 || true) {
+  for (i = 1; i < pb_TimerID_LAST;
+       ++i) { // exclude NONE and OVRELAP from this format
+    if (pb_GetElapsedTime(&t[i]) != 0 || true) {
 
       // Print Category Timer
-      printf("%-*s: %.9f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i]));
+      printf("%-*s: %.9f\n", maxCategoryLength, categories[i - 1],
+             pb_GetElapsedTime(&t[i]));
 
       if (timers->sub_timer_list[i] != NULL) {
         sub = timers->sub_timer_list[i]->subtimer_list;
@@ -1020,47 +986,47 @@ pb_PrintTimerSet(struct pb_TimerSet *timers)
 
         // Fit to Categories
         if (maxSubLength <= maxCategoryLength) {
-         maxSubLength = maxCategoryLength;
+          maxSubLength = maxCategoryLength;
         }
 
         sub = timers->sub_timer_list[i]->subtimer_list;
 
         // Print SubTimers
         while (sub != NULL) {
-          printf(" -%-*s: %.9f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer));
+          printf(" -%-*s: %.9f\n", maxSubLength, sub->label,
+                 pb_GetElapsedTime(&sub->timer));
           sub = sub->next;
         }
       }
     }
   }
 
-  if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0)
-    printf("CPU/Kernel Overlap: %.9f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]));
+  if (pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0)
+    printf("CPU/Kernel Overlap: %.9f\n",
+           pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]));
 
-  float walltime = (wall_end - timers->wall_begin)/ 1e9;
+  float walltime = (wall_end - timers->wall_begin) / 1e9;
   printf("Timer Wall Time: %.9f\n", walltime);
-
 }
 
-void pb_DestroyTimerSet(struct pb_TimerSet * timers)
-{
+void pb_DestroyTimerSet(struct pb_TimerSet *timers) {
   /* clean up all of the async event markers */
-  struct pb_async_time_marker_list* event = timers->async_markers;
-  while(event != NULL) {
+  struct pb_async_time_marker_list *event = timers->async_markers;
+  while (event != NULL) {
 
     cl_int ciErrNum = CL_SUCCESS;
     ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker);
     if (ciErrNum != CL_SUCCESS) {
-      //fprintf(stderr, "Error Waiting for Events!\n");
+      // fprintf(stderr, "Error Waiting for Events!\n");
     }
 
-    ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) );
+    ciErrNum = clReleaseEvent(*((cl_event *)(event)->marker));
     if (ciErrNum != CL_SUCCESS) {
       fprintf(stderr, "Error Release Events!\n");
     }
 
     free((event)->marker);
-    struct pb_async_time_marker_list* next = ((event)->next);
+    struct pb_async_time_marker_list *next = ((event)->next);
 
     free(event);
 
@@ -1069,7 +1035,7 @@ void pb_DestroyTimerSet(struct pb_TimerSet * timers)
   }
 
   int i = 0;
-  for(i = 0; i < pb_TimerID_LAST; ++i) {
+  for (i = 0; i < pb_TimerID_LAST; ++i) {
     if (timers->sub_timer_list[i] != NULL) {
       struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
       struct pb_SubTimer *prev = NULL;
@@ -1083,5 +1049,3 @@ void pb_DestroyTimerSet(struct pb_TimerSet * timers)
     }
   }
 }
-
-
diff --git a/hpvm/test/pipeline/src/io.cc b/hpvm/test/pipeline/src/io.cc
index 045983722390eaa48deff0df0944dff481ee148a..04744f404ebaf6e669c2bbe91600519742b57dc9 100644
--- a/hpvm/test/pipeline/src/io.cc
+++ b/hpvm/test/pipeline/src/io.cc
@@ -10,47 +10,42 @@
  * layout
  */
 
-#include<fstream>
-#include<iostream>
-#include<vector>
+#include <fstream>
+#include <iostream>
+#include <vector>
 
-char* readFile(const char* fileName)
-{
-	std::fstream f(fileName,std::fstream::in);
-	if(!f.good())
-	{
-		std::cerr<<"Error Reading File!!"<<std::endl;
-		return NULL;
-	}
+char *readFile(const char *fileName) {
+  std::fstream f(fileName, std::fstream::in);
+  if (!f.good()) {
+    std::cerr << "Error Reading File!!" << std::endl;
+    return NULL;
+  }
+
+  f.seekg(0, std::ios::end);
+  int length = f.tellg();
+  f.seekg(0, std::ios::beg);
 
-	f.seekg(0,std::ios::end);
-	int length = f.tellg();
-	f.seekg(0,std::ios::beg);
+  char *buffer;
 
-	char* buffer;
+  if (length > 0) {
+    buffer = new char[length];
+    f.read(buffer, length);
+    buffer[length - 1] = 0;
+  } else {
+    buffer = new char;
+    buffer[0] = 0;
+  }
 
-	if(length>0)
-	{
-		buffer = new char[length];
-		f.read(buffer,length);
-		buffer[length-1]=0;
-	}
-	else
-	{
-		buffer = new char;
-		buffer[0] = 0;
-	}
-	
-	f.close();
+  f.close();
 
-	return buffer;
+  return buffer;
 }
 
-bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << std::endl;
+bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col,
+                            std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << std::endl;
   std::fstream f(fn, std::fstream::in);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
@@ -59,33 +54,31 @@ bool readColMajorMatrixFile(const char *fn, int &nr_row, int &nr_col, std::vecto
   f >> nr_col;
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
-  while (f.good() ) {
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
+  while (f.good()) {
     f >> data;
     v.push_back(data);
   }
   v.pop_back(); // remove the duplicated last element
   return true;
-
 }
 
-bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col, std::vector<float>&v)
-{
-  std::cerr << "Opening file:"<< fn << " for write." << std::endl;
+bool writeColMajorMatrixFile(const char *fn, int nr_row, int nr_col,
+                             std::vector<float> &v) {
+  std::cerr << "Opening file:" << fn << " for write." << std::endl;
   std::fstream f(fn, std::fstream::out);
-  if ( !f.good() ) {
+  if (!f.good()) {
     return false;
   }
 
   // Read # of rows and cols
-  f << nr_row << " "<<nr_col<<" ";
+  f << nr_row << " " << nr_col << " ";
 
   float data;
-  std::cerr << "Matrix dimension: "<<nr_row<<"x"<<nr_col<<std::endl;
+  std::cerr << "Matrix dimension: " << nr_row << "x" << nr_col << std::endl;
   for (int i = 0; i < v.size(); ++i) {
     f << v[i] << ' ';
   }
   f << "\n";
   return true;
-
 }
diff --git a/hpvm/test/pipeline/src/main.cc b/hpvm/test/pipeline/src/main.cc
index 0d2582d41be135645155676b988e44538c472273..9314833d25d0a3a25f13dfb24fb8a239b94956b1 100644
--- a/hpvm/test/pipeline/src/main.cc
+++ b/hpvm/test/pipeline/src/main.cc
@@ -10,52 +10,47 @@
  * Main entry of dense matrix-matrix multiplication kernel
  */
 
-#include "opencv2/opencv.hpp"
 #include "opencv2/ocl/ocl.hpp"
-#include <stdio.h>
+#include "opencv2/opencv.hpp"
+#include <cassert>
+#include <iostream>
+#include <malloc.h>
 #include <math.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/time.h>
-#include <malloc.h>
-#include <iostream>
-#include <cassert>
 #include <visc.h>
 
-
-#define NUM_RUNS 100 
+#define NUM_RUNS 100
 #define DEPTH 3
 #define HEIGHT 640
 #define WIDTH 480
 
-std::string input_window =  "GPU Pipeline - Input Video";
+std::string input_window = "GPU Pipeline - Input Video";
 std::string output_window = "GPU Pipeline - Edge Mapping";
 
-
 #ifdef MIDDLE
-  #define POSX_IN     640
-  #define POSY_IN     0
-  #define POSX_OUT    640
-  #define POSY_OUT    540
+#define POSX_IN 640
+#define POSY_IN 0
+#define POSX_OUT 640
+#define POSY_OUT 540
 
 #elif RIGHT
-  #define POSX_IN     1280
-  #define POSY_IN     0
-  #define POSX_OUT    1280
-  #define POSY_OUT    540
+#define POSX_IN 1280
+#define POSY_IN 0
+#define POSX_OUT 1280
+#define POSY_OUT 540
 
 #else // LEFT
-  #define POSX_IN     0
-  #define POSY_IN     0
-  #define POSX_OUT    0
-  #define POSY_OUT    540
+#define POSX_IN 0
+#define POSY_IN 0
+#define POSX_OUT 0
+#define POSY_OUT 540
 #endif
 
-
 //#define NUM_FRAMES 20
 
-
-
 // Definitions of sizes for edge detection kernels
 
 #define MIN_BR 0.0f
@@ -66,33 +61,33 @@ std::string output_window = "GPU Pipeline - Edge Mapping";
 
 #define REDUCTION_TILE_SZ 1024
 
-#define _MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define _MAX(X,Y) ((X) > (Y) ? (X) : (Y))
+#define _MIN(X, Y) ((X) < (Y) ? (X) : (Y))
+#define _MAX(X, Y) ((X) > (Y) ? (X) : (Y))
 
 extern "C" {
 
 struct __attribute__((__packed__)) InStruct {
-  float* I ;
+  float *I;
   size_t bytesI;
-  float* Is ;
+  float *Is;
   size_t bytesIs;
-  float* L;
+  float *L;
   size_t bytesL;
-  float* S;
+  float *S;
   size_t bytesS;
-  float* G;
+  float *G;
   size_t bytesG;
-  float* maxG;
+  float *maxG;
   size_t bytesMaxG;
-  float* E;
+  float *E;
   size_t bytesE;
-  float* Gs;
+  float *Gs;
   size_t bytesGs;
-  float* B;
+  float *B;
   size_t bytesB;
-  float* Sx;
+  float *Sx;
   size_t bytesSx;
-  float* Sy;
+  float *Sy;
   size_t bytesSy;
   long m;
   long n;
@@ -100,20 +95,12 @@ struct __attribute__((__packed__)) InStruct {
   long grid_x;
 };
 
-
-void packData(struct InStruct* args, float* I, size_t bytesI,
-                                     float* Is, size_t bytesIs,
-                                     float* L, size_t bytesL,
-                                     float* S, size_t bytesS,
-                                     float* G, size_t bytesG,
-                                     float* maxG, size_t bytesMaxG,
-                                     float* E, size_t bytesE,
-                                     float* Gs, size_t bytesGs,
-                                     float* B, size_t bytesB,
-                                     float* Sx, size_t bytesSx,
-                                     float* Sy, size_t bytesSy,
-                                     long m, long n,
-                                     long block_x, long grid_x) {
+void packData(struct InStruct *args, float *I, size_t bytesI, float *Is,
+              size_t bytesIs, float *L, size_t bytesL, float *S, size_t bytesS,
+              float *G, size_t bytesG, float *maxG, size_t bytesMaxG, float *E,
+              size_t bytesE, float *Gs, size_t bytesGs, float *B, size_t bytesB,
+              float *Sx, size_t bytesSx, float *Sy, size_t bytesSy, long m,
+              long n, long block_x, long grid_x) {
   args->I = I;
   args->bytesI = bytesI;
   args->Is = Is;
@@ -142,13 +129,13 @@ void packData(struct InStruct* args, float* I, size_t bytesI,
   args->grid_x = grid_x;
 }
 
-/* 
+/*
  * Gaussian smoothing of image I of size m x n
  * I : input image
  * Gs : gaussian filter
  * Is: output (smoothed image)
  * m, n : dimensions
- * 
+ *
  * Need 2D grid, a thread per pixel
  * No use of separable algorithm because we need to do this in one kernel
  * No use of shared memory because
@@ -157,19 +144,17 @@ void packData(struct InStruct* args, float* I, size_t bytesI,
 
 #define GAUSSIAN_SIZE 7
 #define GAUSSIAN_RADIUS (GAUSSIAN_SIZE / 2)
-void gaussianSmoothing(float *I, size_t bytesI,
-                       float *Gs, size_t bytesGs,
-                       float *Is, size_t bytesIs,
-                       long m, long n) {
+void gaussianSmoothing(float *I, size_t bytesI, float *Gs, size_t bytesGs,
+                       float *Is, size_t bytesIs, long m, long n) {
 
   __visc__hint(visc::DEVICE);
   __visc__attributes(2, I, Gs, 1, Is);
 
-  void* thisNode = __visc__getNode();
+  void *thisNode = __visc__getNode();
   long gx = __visc__getNodeInstanceID_x(thisNode);
   long gy = __visc__getNodeInstanceID_y(thisNode);
 
-  int gloc = gx + gy*n;
+  int gloc = gx + gy * n;
 
   float smoothedVal = 0;
   float gval;
@@ -179,37 +164,38 @@ void gaussianSmoothing(float *I, size_t bytesI,
     for (int i = -GAUSSIAN_RADIUS; i <= GAUSSIAN_RADIUS; i++)
       for (int j = -GAUSSIAN_RADIUS; j <= GAUSSIAN_RADIUS; j++) {
 
-        loadOffset = gloc + i*n + j;
-      
+        loadOffset = gloc + i * n + j;
+
         if ((gy + i) < 0) // top contour
           loadOffset = gx + j;
-        else if ((gy + i) > m-1 ) // bottom contour
-          loadOffset = (m-1)*n + gx + j;
-        else 
-          loadOffset = gloc + i*n + j; // within image vertically
+        else if ((gy + i) > m - 1) // bottom contour
+          loadOffset = (m - 1) * n + gx + j;
+        else
+          loadOffset = gloc + i * n + j; // within image vertically
 
         // Adjust so we are within image horizonally
         if ((gx + j) < 0) // left contour
-          loadOffset -= (gx+j);
-        else if ((gx + j) > n-1 ) // right contour
+          loadOffset -= (gx + j);
+        else if ((gx + j) > n - 1) // right contour
           loadOffset = loadOffset - gx - j + n - 1;
 
         gval = I[loadOffset];
-        smoothedVal += gval * Gs[(GAUSSIAN_RADIUS + i)*GAUSSIAN_SIZE + GAUSSIAN_RADIUS + j];
+        smoothedVal +=
+            gval *
+            Gs[(GAUSSIAN_RADIUS + i) * GAUSSIAN_SIZE + GAUSSIAN_RADIUS + j];
       }
-  
+
     Is[gloc] = smoothedVal;
   }
   __visc__return(2, bytesIs, bytesIs);
 }
 
-void WrapperGaussianSmoothing(float *I, size_t bytesI,
-                       float *Gs, size_t bytesGs,
-                       float *Is, size_t bytesIs,
-                       long m, long n) {
+void WrapperGaussianSmoothing(float *I, size_t bytesI, float *Gs,
+                              size_t bytesGs, float *Is, size_t bytesIs, long m,
+                              long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, I, Gs, 1, Is);
-  void* GSNode = __visc__createNodeND(2, gaussianSmoothing, m, n);
+  void *GSNode = __visc__createNodeND(2, gaussianSmoothing, m, n);
   __visc__bindIn(GSNode, 0, 0, 0); // Bind I
   __visc__bindIn(GSNode, 1, 1, 0); // Bind bytesI
   __visc__bindIn(GSNode, 2, 2, 0); // Bind Gs
@@ -223,7 +209,6 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI,
   __visc__bindOut(GSNode, 1, 1, 0); // bind output bytesIs
 }
 
-
 /* Compute a non-linear laplacian estimate of input image I of size m x n */
 /*
  * Is   : blurred imput image
@@ -231,88 +216,99 @@ void WrapperGaussianSmoothing(float *I, size_t bytesI,
  * B    : structural element for dilation - erosion ([0 1 0; 1 1 1; 0 1 0])
  * L    : output (laplacian of the image)
  * Need 2D grid, a thread per pixel
-*/
-void laplacianEstimate(float *Is, size_t bytesIs,
-                            float *B, size_t bytesB,
-                            float *L, size_t bytesL,
-                            long m, long n) {
+ */
+void laplacianEstimate(float *Is, size_t bytesIs, float *B, size_t bytesB,
+                       float *L, size_t bytesL, long m, long n) {
 
   __visc__hint(visc::DEVICE);
   __visc__attributes(2, Is, B, 1, L);
   // 3x3 image area
-  float imageArea[SZB*SZB];
+  float imageArea[SZB * SZB];
 
-  void* thisNode = __visc__getNode();
+  void *thisNode = __visc__getNode();
   long gx = __visc__getNodeInstanceID_x(thisNode);
   long gy = __visc__getNodeInstanceID_y(thisNode);
   int i, j;
 
   if ((gx < n) && (gy < m)) {
     // Data copy for dilation filter
-    imageArea[1 * SZB +1] = Is[gy * n + gx];
+    imageArea[1 * SZB + 1] = Is[gy * n + gx];
 
     if (gx == 0) {
-      imageArea[0 * SZB +0] = imageArea[1 * SZB +0] = imageArea[2 * SZB +0] = MIN_BR;
+      imageArea[0 * SZB + 0] = imageArea[1 * SZB + 0] = imageArea[2 * SZB + 0] =
+          MIN_BR;
     } else {
-      imageArea[1 * SZB +0] = Is[gy * n + gx - 1];
-      imageArea[0 * SZB +0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR;
-      imageArea[2 * SZB +0] = (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR;
+      imageArea[1 * SZB + 0] = Is[gy * n + gx - 1];
+      imageArea[0 * SZB + 0] = (gy > 0) ? Is[(gy - 1) * n + gx - 1] : MIN_BR;
+      imageArea[2 * SZB + 0] =
+          (gy < m - 1) ? Is[(gy + 1) * n + gx - 1] : MIN_BR;
     }
 
     if (gx == n - 1) {
-      imageArea[0 * SZB +2] = imageArea[1 * SZB +2] = imageArea[2 * SZB +2] = MIN_BR;
+      imageArea[0 * SZB + 2] = imageArea[1 * SZB + 2] = imageArea[2 * SZB + 2] =
+          MIN_BR;
     } else {
-      imageArea[1 * SZB +2] = Is[gy * n + gx + 1];
-      imageArea[0 * SZB +2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR;
-      imageArea[2 * SZB +2] = (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR;
+      imageArea[1 * SZB + 2] = Is[gy * n + gx + 1];
+      imageArea[0 * SZB + 2] = (gy > 0) ? Is[(gy - 1) * n + gx + 1] : MIN_BR;
+      imageArea[2 * SZB + 2] =
+          (gy < m - 1) ? Is[(gy + 1) * n + gx + 1] : MIN_BR;
     }
 
-    imageArea[0 * SZB +1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR;
-    imageArea[2 * SZB +1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR;
+    imageArea[0 * SZB + 1] = (gy > 0) ? Is[(gy - 1) * n + gx] : MIN_BR;
+    imageArea[2 * SZB + 1] = (gy < m - 1) ? Is[(gy + 1) * n + gx] : MIN_BR;
 
     // Compute pixel of dilated image
     float dilatedPixel = MIN_BR;
     for (i = 0; i < SZB; i++)
       for (j = 0; j < SZB; j++)
-        dilatedPixel = _MAX(dilatedPixel, imageArea[i * SZB +j] * B[i*SZB + j]);
+        dilatedPixel =
+            _MAX(dilatedPixel, imageArea[i * SZB + j] * B[i * SZB + j]);
 
     // Data copy for erotion filter - only change the boundary conditions
     if (gx == 0) {
-      imageArea[0 * SZB +0] = imageArea[1 * SZB +0] = imageArea[2 * SZB +0] = MAX_BR;
+      imageArea[0 * SZB + 0] = imageArea[1 * SZB + 0] = imageArea[2 * SZB + 0] =
+          MAX_BR;
     } else {
-      if (gy == 0) imageArea[0 * SZB +0] = MAX_BR;
-      if (gy == m-1) imageArea[2 * SZB +0] = MAX_BR;
+      if (gy == 0)
+        imageArea[0 * SZB + 0] = MAX_BR;
+      if (gy == m - 1)
+        imageArea[2 * SZB + 0] = MAX_BR;
     }
 
     if (gx == n - 1) {
-      imageArea[0 * SZB +2] = imageArea[1 * SZB +2] = imageArea[2 * SZB +2] = MAX_BR;
+      imageArea[0 * SZB + 2] = imageArea[1 * SZB + 2] = imageArea[2 * SZB + 2] =
+          MAX_BR;
     } else {
-      if (gy == 0) imageArea[0 * SZB +2] = MAX_BR;
-      if (gy == m-1) imageArea[2 * SZB +2] = MAX_BR;
+      if (gy == 0)
+        imageArea[0 * SZB + 2] = MAX_BR;
+      if (gy == m - 1)
+        imageArea[2 * SZB + 2] = MAX_BR;
     }
 
-    if (gy == 0) imageArea[0 * SZB +1] = MAX_BR;
-    if (gy == m-1) imageArea[2 * SZB +1] = MAX_BR;
+    if (gy == 0)
+      imageArea[0 * SZB + 1] = MAX_BR;
+    if (gy == m - 1)
+      imageArea[2 * SZB + 1] = MAX_BR;
 
     // Compute pixel of eroded image
     float erodedPixel = MAX_BR;
     for (i = 0; i < SZB; i++)
       for (j = 0; j < SZB; j++)
-        erodedPixel = _MIN(erodedPixel, imageArea[i * SZB +j] * B[i*SZB + j]);
+        erodedPixel =
+            _MIN(erodedPixel, imageArea[i * SZB + j] * B[i * SZB + j]);
 
-    float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1 * SZB +1];
-    L[gy*n+gx] = laplacian;
+    float laplacian = dilatedPixel + erodedPixel - 2 * imageArea[1 * SZB + 1];
+    L[gy * n + gx] = laplacian;
   }
   __visc__return(1, bytesL);
 }
 
-void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
-                          float *B, size_t bytesB,
-                          float *L, size_t bytesL,
-                          long m, long n) {
+void WrapperlaplacianEstimate(float *Is, size_t bytesIs, float *B,
+                              size_t bytesB, float *L, size_t bytesL, long m,
+                              long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, Is, B, 1, L);
-  void* LNode = __visc__createNodeND(2, laplacianEstimate, m, n);
+  void *LNode = __visc__createNodeND(2, laplacianEstimate, m, n);
   __visc__bindIn(LNode, 0, 0, 0); // Bind Is
   __visc__bindIn(LNode, 1, 1, 0); // Bind bytesIs
   __visc__bindIn(LNode, 2, 2, 0); // Bind B
@@ -323,7 +319,6 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
   __visc__bindIn(LNode, 7, 7, 0); // Bind n
 
   __visc__bindOut(LNode, 0, 0, 0); // bind output bytesL
-
 }
 
 /* Compute the zero crossings of input image L of size m x n */
@@ -334,10 +329,8 @@ void WrapperlaplacianEstimate(float *Is, size_t bytesIs,
  * S    : output (sign of the image)
  * Need 2D grid, a thread per pixel
  */
-void computeZeroCrossings(float *L, size_t bytesL,
-                          float *B, size_t bytesB,
-                          float *S, size_t bytesS,
-                          long m, long n) {
+void computeZeroCrossings(float *L, size_t bytesL, float *B, size_t bytesB,
+                          float *S, size_t bytesS, long m, long n) {
   __visc__hint(visc::DEVICE);
   //__visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, L, B, 1, S);
@@ -345,89 +338,93 @@ void computeZeroCrossings(float *L, size_t bytesL,
   // 3x3 image area
   float imageArea[SZB][SZB];
 
-  void* thisNode = __visc__getNode();
+  void *thisNode = __visc__getNode();
   long gx = __visc__getNodeInstanceID_x(thisNode);
   long gy = __visc__getNodeInstanceID_y(thisNode);
   int i, j;
 
   if ((gx < n) && (gy < m)) {
     // Data copy for dilation filter
-    imageArea[1][1] = L[gy * n + gx] > MIN_BR? MAX_BR : MIN_BR;
+    imageArea[1][1] = L[gy * n + gx] > MIN_BR ? MAX_BR : MIN_BR;
 
     if (gx == 0) { // left most line
       imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MIN_BR;
-    } else { 
-      imageArea[1][0] = L[gy * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR;
-      imageArea[0][0] = (gy > 0) ?
-                            (L[(gy - 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR)
-                            : MIN_BR;
-      imageArea[2][0] = (gy < m - 1) ?
-                            (L[(gy + 1) * n + gx - 1] > MIN_BR? MAX_BR : MIN_BR)
-                            : MIN_BR;
+    } else {
+      imageArea[1][0] = L[gy * n + gx - 1] > MIN_BR ? MAX_BR : MIN_BR;
+      imageArea[0][0] =
+          (gy > 0) ? (L[(gy - 1) * n + gx - 1] > MIN_BR ? MAX_BR : MIN_BR)
+                   : MIN_BR;
+      imageArea[2][0] =
+          (gy < m - 1) ? (L[(gy + 1) * n + gx - 1] > MIN_BR ? MAX_BR : MIN_BR)
+                       : MIN_BR;
     }
 
     if (gx == n - 1) {
       imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MIN_BR;
     } else {
-      imageArea[1][2] = L[gy * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR;
-      imageArea[0][2] = (gy > 0) ?
-                            (L[(gy - 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR)
-                            : MIN_BR;
-      imageArea[2][2] = (gy < m - 1) ?
-                            (L[(gy + 1) * n + gx + 1] > MIN_BR? MAX_BR : MIN_BR)
-                            : MIN_BR;
+      imageArea[1][2] = L[gy * n + gx + 1] > MIN_BR ? MAX_BR : MIN_BR;
+      imageArea[0][2] =
+          (gy > 0) ? (L[(gy - 1) * n + gx + 1] > MIN_BR ? MAX_BR : MIN_BR)
+                   : MIN_BR;
+      imageArea[2][2] =
+          (gy < m - 1) ? (L[(gy + 1) * n + gx + 1] > MIN_BR ? MAX_BR : MIN_BR)
+                       : MIN_BR;
     }
 
-    imageArea[0][1] = (gy > 0) ?
-                          (L[(gy - 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR)
-                          : MIN_BR;
-    imageArea[2][1] = (gy < m - 1)?
-                          (L[(gy + 1) * n + gx] > MIN_BR? MAX_BR : MIN_BR)
+    imageArea[0][1] =
+        (gy > 0) ? (L[(gy - 1) * n + gx] > MIN_BR ? MAX_BR : MIN_BR) : MIN_BR;
+    imageArea[2][1] = (gy < m - 1)
+                          ? (L[(gy + 1) * n + gx] > MIN_BR ? MAX_BR : MIN_BR)
                           : MIN_BR;
 
     // Compute pixel of dilated image
     float dilatedPixel = MIN_BR;
     for (i = 0; i < SZB; i++)
       for (j = 0; j < SZB; j++)
-        dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i*SZB + j]);
+        dilatedPixel = _MAX(dilatedPixel, imageArea[i][j] * B[i * SZB + j]);
 
     // Data copy for erotion filter - only change the boundary conditions
     if (gx == 0) {
       imageArea[0][0] = imageArea[1][0] = imageArea[2][0] = MAX_BR;
     } else {
-      if (gy == 0) imageArea[0][0] = MAX_BR;
-      if (gy == m-1) imageArea[2][0] = MAX_BR;
+      if (gy == 0)
+        imageArea[0][0] = MAX_BR;
+      if (gy == m - 1)
+        imageArea[2][0] = MAX_BR;
     }
 
     if (gx == n - 1) {
       imageArea[0][2] = imageArea[1][2] = imageArea[2][2] = MAX_BR;
     } else {
-      if (gy == 0) imageArea[0][2] = MAX_BR;
-      if (gy == m-1) imageArea[2][2] = MAX_BR;
+      if (gy == 0)
+        imageArea[0][2] = MAX_BR;
+      if (gy == m - 1)
+        imageArea[2][2] = MAX_BR;
     }
 
-    if (gy == 0) imageArea[0][1] = MAX_BR;
-    if (gy == m-1) imageArea[2][1] = MAX_BR;
+    if (gy == 0)
+      imageArea[0][1] = MAX_BR;
+    if (gy == m - 1)
+      imageArea[2][1] = MAX_BR;
 
     // Compute pixel of eroded image
     float erodedPixel = MAX_BR;
     for (i = 0; i < SZB; i++)
       for (j = 0; j < SZB; j++)
-        erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i*SZB + j]);
+        erodedPixel = _MIN(erodedPixel, imageArea[i][j] * B[i * SZB + j]);
 
     float pixelSign = dilatedPixel - erodedPixel;
-    S[gy*n+gx] = pixelSign;
+    S[gy * n + gx] = pixelSign;
   }
-  __visc__return(1, bytesS); 
+  __visc__return(1, bytesS);
 }
 
-void WrapperComputeZeroCrossings(float *L, size_t bytesL,
-                          float *B, size_t bytesB,
-                          float *S, size_t bytesS,
-                          long m, long n) {
+void WrapperComputeZeroCrossings(float *L, size_t bytesL, float *B,
+                                 size_t bytesB, float *S, size_t bytesS, long m,
+                                 long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, L, B, 1, S);
-  void* ZCNode = __visc__createNodeND(2, computeZeroCrossings, m, n);
+  void *ZCNode = __visc__createNodeND(2, computeZeroCrossings, m, n);
   __visc__bindIn(ZCNode, 0, 0, 0); // Bind L
   __visc__bindIn(ZCNode, 1, 1, 0); // Bind bytesL
   __visc__bindIn(ZCNode, 2, 2, 0); // Bind B
@@ -438,7 +435,6 @@ void WrapperComputeZeroCrossings(float *L, size_t bytesL,
   __visc__bindIn(ZCNode, 7, 7, 0); // Bind n
 
   __visc__bindOut(ZCNode, 0, 0, 0); // bind output bytesS
-
 }
 
 /*
@@ -458,20 +454,18 @@ void WrapperComputeZeroCrossings(float *L, size_t bytesL,
 #define SOBEL_SIZE 3
 #define SOBEL_RADIUS (SOBEL_SIZE / 2)
 
-void computeGradient(float *Is, size_t bytesIs,
-              float *Sx, size_t bytesSx,
-              float *Sy, size_t bytesSy,
-              float *G, size_t bytesG,
-              long m, long n) {
+void computeGradient(float *Is, size_t bytesIs, float *Sx, size_t bytesSx,
+                     float *Sy, size_t bytesSy, float *G, size_t bytesG, long m,
+                     long n) {
 
   __visc__hint(visc::DEVICE);
   __visc__attributes(3, Is, Sx, Sy, 1, G);
 
-  void* thisNode = __visc__getNode();
+  void *thisNode = __visc__getNode();
   long gx = __visc__getNodeInstanceID_x(thisNode);
   long gy = __visc__getNodeInstanceID_y(thisNode);
 
-  int gloc = gx + gy*n;
+  int gloc = gx + gy * n;
 
   float Gx = 0;
   float Gy = 0;
@@ -482,39 +476,37 @@ void computeGradient(float *Is, size_t bytesIs,
     for (int i = -SOBEL_RADIUS; i <= SOBEL_RADIUS; i++)
       for (int j = -SOBEL_RADIUS; j <= SOBEL_RADIUS; j++) {
 
-        loadOffset = gloc + i*n + j;
-      
+        loadOffset = gloc + i * n + j;
+
         if ((gy + i) < 0) // top contour
           loadOffset = gx + j;
-        else if ((gy + i) > m-1 ) // bottom contour
-          loadOffset = (m-1)*n + gx + j;
-        else 
-          loadOffset = gloc + i*n + j; // within image vertically
+        else if ((gy + i) > m - 1) // bottom contour
+          loadOffset = (m - 1) * n + gx + j;
+        else
+          loadOffset = gloc + i * n + j; // within image vertically
 
         // Adjust so we are within image horizonally
         if ((gx + j) < 0) // left contour
-          loadOffset -= (gx+j);
-        else if ((gx + j) > n-1 ) // right contour
+          loadOffset -= (gx + j);
+        else if ((gx + j) > n - 1) // right contour
           loadOffset = loadOffset - gx - j + n - 1;
 
         gval = Is[loadOffset];
-        Gx += gval * Sx[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j];
-        Gy += gval * Sy[(SOBEL_RADIUS + i)*SOBEL_SIZE + SOBEL_RADIUS + j];
+        Gx += gval * Sx[(SOBEL_RADIUS + i) * SOBEL_SIZE + SOBEL_RADIUS + j];
+        Gy += gval * Sy[(SOBEL_RADIUS + i) * SOBEL_SIZE + SOBEL_RADIUS + j];
       }
 
-    G[gloc] = sqrt(Gx*Gx + Gy*Gy);
+    G[gloc] = sqrt(Gx * Gx + Gy * Gy);
   }
   __visc__return(1, bytesG);
 }
 
-void WrapperComputeGradient(float *Is, size_t bytesIs,
-                                 float *Sx, size_t bytesSx,
-                                 float *Sy, size_t bytesSy,
-                                 float *G, size_t bytesG,
-                                 long m, long n) {
+void WrapperComputeGradient(float *Is, size_t bytesIs, float *Sx,
+                            size_t bytesSx, float *Sy, size_t bytesSy, float *G,
+                            size_t bytesG, long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(3, Is, Sx, Sy, 1, G);
-  void* CGNode = __visc__createNodeND(2, computeGradient, m, n);
+  void *CGNode = __visc__createNodeND(2, computeGradient, m, n);
   __visc__bindIn(CGNode, 0, 0, 0); // Bind Is
   __visc__bindIn(CGNode, 1, 1, 0); // Bind bytesIs
   __visc__bindIn(CGNode, 2, 2, 0); // Bind Sx
@@ -529,35 +521,34 @@ void WrapperComputeGradient(float *Is, size_t bytesIs,
   __visc__bindOut(CGNode, 0, 0, 0); // bind output bytesG
 }
 
-/* 
+/*
  * Reduction
  * G : input
  * maxG: output
  * m, n: input size
  * Needs a single thread block
  */
-void computeMaxGradientLeaf(float *G, size_t bytesG,
-                            float *maxG, size_t bytesMaxG,
-                            long m, long n) {
+void computeMaxGradientLeaf(float *G, size_t bytesG, float *maxG,
+                            size_t bytesMaxG, long m, long n) {
 
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(1, G, 1, maxG);
 
-  void* thisNode = __visc__getNode();
+  void *thisNode = __visc__getNode();
 
-  long lx = __visc__getNodeInstanceID_x(thisNode);       // threadIdx.x
-  long dimx = __visc__getNumNodeInstances_x(thisNode);   // blockDim.x
+  long lx = __visc__getNodeInstanceID_x(thisNode);     // threadIdx.x
+  long dimx = __visc__getNumNodeInstances_x(thisNode); // blockDim.x
 
   // Assume a single thread block
   // Thread block iterates over all elements
-  for (int i = lx + dimx; i < m*n; i+= dimx) {
+  for (int i = lx + dimx; i < m * n; i += dimx) {
     if (G[lx] < G[i])
       G[lx] = G[i];
   }
 
   // First thread iterates over all elements of the thread block
-  long bounds = dimx < m*n ? dimx : m*n;
-	if (lx == 0) {
+  long bounds = dimx < m * n ? dimx : m * n;
+  if (lx == 0) {
     for (int i = 1; i < bounds; i++)
       if (G[lx] < G[i])
         G[lx] = G[i];
@@ -568,13 +559,11 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
   __visc__return(1, bytesMaxG);
 }
 
-void computeMaxGradientTB(float *G, size_t bytesG,
-                           float *maxG, size_t bytesMaxG,
-                           long m, long n,
-                           long block_x) {
+void computeMaxGradientTB(float *G, size_t bytesG, float *maxG,
+                          size_t bytesMaxG, long m, long n, long block_x) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, G, maxG, 1, maxG);
-  void* CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x);
+  void *CMGLeafNode = __visc__createNodeND(1, computeMaxGradientLeaf, block_x);
   __visc__bindIn(CMGLeafNode, 0, 0, 0); // Bind G
   __visc__bindIn(CMGLeafNode, 1, 1, 0); // Bind bytesG
   __visc__bindIn(CMGLeafNode, 2, 2, 0); // Bind maxG
@@ -585,13 +574,12 @@ void computeMaxGradientTB(float *G, size_t bytesG,
   __visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output bytesMaxG
 }
 
-void WrapperComputeMaxGradient(float *G, size_t bytesG,
-                           float *maxG, size_t bytesMaxG,
-                           long m, long n,
-                           long block_x, long grid_x) {
+void WrapperComputeMaxGradient(float *G, size_t bytesG, float *maxG,
+                               size_t bytesMaxG, long m, long n, long block_x,
+                               long grid_x) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(2, G, maxG, 1, maxG);
-  void* CMGTBNode = __visc__createNodeND(1, computeMaxGradientTB, grid_x);
+  void *CMGTBNode = __visc__createNodeND(1, computeMaxGradientTB, grid_x);
   __visc__bindIn(CMGTBNode, 0, 0, 0); // Bind G
   __visc__bindIn(CMGTBNode, 1, 1, 0); // Bind bytesG
   __visc__bindIn(CMGTBNode, 2, 2, 0); // Bind maxG
@@ -613,327 +601,307 @@ void WrapperComputeMaxGradient(float *G, size_t bytesG,
  */
 
 #define THETA 0.1
-void rejectZeroCrossings(float *S, size_t bytesS,
-                         float *G, size_t bytesG,
-                         float *maxG, size_t bytesMaxG,
-                         float *E, size_t bytesE,
+void rejectZeroCrossings(float *S, size_t bytesS, float *G, size_t bytesG,
+                         float *maxG, size_t bytesMaxG, float *E, size_t bytesE,
                          long m, long n) {
   __visc__hint(visc::DEVICE);
   __visc__attributes(3, S, G, maxG, 1, E);
 
-  void* thisNode = __visc__getNode();
+  void *thisNode = __visc__getNode();
   int gx = __visc__getNodeInstanceID_x(thisNode);
   int gy = __visc__getNodeInstanceID_y(thisNode);
 
   float mG = *maxG;
   if ((gx < n) && (gy < m)) {
-    E[gy*n+gx] = ((S[gy*n+gx] > 0.0) && (G[gy*n+gx] > THETA*mG)) ? 1.0 : 0.0 ;
+    E[gy * n + gx] =
+        ((S[gy * n + gx] > 0.0) && (G[gy * n + gx] > THETA * mG)) ? 1.0 : 0.0;
   }
   __visc__return(1, bytesE);
 }
 
-void WrapperRejectZeroCrossings(float *S, size_t bytesS,
-                         float *G, size_t bytesG,
-                         float *maxG, size_t bytesMaxG,
-                         float *E, size_t bytesE,
-                         long m, long n) {
+void WrapperRejectZeroCrossings(float *S, size_t bytesS, float *G,
+                                size_t bytesG, float *maxG, size_t bytesMaxG,
+                                float *E, size_t bytesE, long m, long n) {
   __visc__hint(visc::CPU_TARGET);
   __visc__attributes(3, S, G, maxG, 1, E);
-  void* RZCNode = __visc__createNodeND(2, rejectZeroCrossings, m, n);
-  __visc__bindIn(RZCNode, 0, 0 , 0); // Bind S
-  __visc__bindIn(RZCNode, 1, 1 , 0); // Bind bytesS
-  __visc__bindIn(RZCNode, 2, 2 , 0); // Bind G
-  __visc__bindIn(RZCNode, 3, 3 , 0); // Bind bytesG
-  __visc__bindIn(RZCNode, 4, 4 , 0); // Bind maxG
-  __visc__bindIn(RZCNode, 5, 5 , 0); // Bind bytesMaxG
-  __visc__bindIn(RZCNode, 6, 6 , 0); // Bind E
-  __visc__bindIn(RZCNode, 7, 7 , 0); // Bind bytesE
-  __visc__bindIn(RZCNode, 8, 8 , 0); // Bind m
+  void *RZCNode = __visc__createNodeND(2, rejectZeroCrossings, m, n);
+  __visc__bindIn(RZCNode, 0, 0, 0); // Bind S
+  __visc__bindIn(RZCNode, 1, 1, 0); // Bind bytesS
+  __visc__bindIn(RZCNode, 2, 2, 0); // Bind G
+  __visc__bindIn(RZCNode, 3, 3, 0); // Bind bytesG
+  __visc__bindIn(RZCNode, 4, 4, 0); // Bind maxG
+  __visc__bindIn(RZCNode, 5, 5, 0); // Bind bytesMaxG
+  __visc__bindIn(RZCNode, 6, 6, 0); // Bind E
+  __visc__bindIn(RZCNode, 7, 7, 0); // Bind bytesE
+  __visc__bindIn(RZCNode, 8, 8, 0); // Bind m
   __visc__bindIn(RZCNode, 9, 9, 0); // Bind n
 
   __visc__bindOut(RZCNode, 0, 0, 0); // bind output bytesE
 }
 
-
-
 // Pipelined Root node
-void edgeDetection(float *I, size_t bytesI, // 0
-                   float *Is, size_t bytesIs, // 2
-                   float *L, size_t bytesL, // 4
-                   float *S, size_t bytesS, // 6
-                   float *G, size_t bytesG, // 8
+void edgeDetection(float *I, size_t bytesI,       // 0
+                   float *Is, size_t bytesIs,     // 2
+                   float *L, size_t bytesL,       // 4
+                   float *S, size_t bytesS,       // 6
+                   float *G, size_t bytesG,       // 8
                    float *maxG, size_t bytesMaxG, // 10
-                   float *E, size_t bytesE, // 12
-                   float *Gs, size_t bytesGs, // 14
-                   float *B, size_t bytesB, // 16
-                   float *Sx, size_t bytesSx, // 18
-                   float *Sy, size_t bytesSy, // 20
-                   long m, // 22
-                   long n, // 23
-                   long block_x, // 24
-                   long grid_x // 25
-                   ) { 
+                   float *E, size_t bytesE,       // 12
+                   float *Gs, size_t bytesGs,     // 14
+                   float *B, size_t bytesB,       // 16
+                   float *Sx, size_t bytesSx,     // 18
+                   float *Sy, size_t bytesSy,     // 20
+                   long m,                        // 22
+                   long n,                        // 23
+                   long block_x,                  // 24
+                   long grid_x                    // 25
+) {
   __visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E);
   __visc__hint(visc::CPU_TARGET);
-  void* GSNode = __visc__createNodeND(0, WrapperGaussianSmoothing);
-  void* LNode = __visc__createNodeND(0, WrapperlaplacianEstimate);
-  void* CZCNode = __visc__createNodeND(0, WrapperComputeZeroCrossings);
-  void* CGNode = __visc__createNodeND(0, WrapperComputeGradient);
-  void* CMGNode = __visc__createNodeND(0, WrapperComputeMaxGradient);
-  void* RZCNode = __visc__createNodeND(0, WrapperRejectZeroCrossings);
+  void *GSNode = __visc__createNodeND(0, WrapperGaussianSmoothing);
+  void *LNode = __visc__createNodeND(0, WrapperlaplacianEstimate);
+  void *CZCNode = __visc__createNodeND(0, WrapperComputeZeroCrossings);
+  void *CGNode = __visc__createNodeND(0, WrapperComputeGradient);
+  void *CMGNode = __visc__createNodeND(0, WrapperComputeMaxGradient);
+  void *RZCNode = __visc__createNodeND(0, WrapperRejectZeroCrossings);
 
   // Gaussian Inputs
-  __visc__bindIn(GSNode, 0 , 0, 1); // Bind I
-  __visc__bindIn(GSNode, 1 , 1, 1); // Bind bytesI
+  __visc__bindIn(GSNode, 0, 0, 1);  // Bind I
+  __visc__bindIn(GSNode, 1, 1, 1);  // Bind bytesI
   __visc__bindIn(GSNode, 14, 2, 1); // Bind Gs
   __visc__bindIn(GSNode, 15, 3, 1); // Bind bytesGs
-  __visc__bindIn(GSNode, 2 , 4, 1); // Bind Is
-  __visc__bindIn(GSNode, 3 , 5, 1); // Bind bytesIs
+  __visc__bindIn(GSNode, 2, 4, 1);  // Bind Is
+  __visc__bindIn(GSNode, 3, 5, 1);  // Bind bytesIs
   __visc__bindIn(GSNode, 22, 6, 1); // Bind m
   __visc__bindIn(GSNode, 23, 7, 1); // Bind n
 
   // Laplacian Inputs
-  __visc__bindIn(LNode, 2 , 0, 1); // Bind Is
+  __visc__bindIn(LNode, 2, 0, 1);          // Bind Is
   __visc__edge(GSNode, LNode, 1, 0, 1, 1); // Get bytesIs
-  __visc__bindIn(LNode, 16, 2, 1); // Bind B
-  __visc__bindIn(LNode, 17, 3, 1); // Bind bytesB
-  __visc__bindIn(LNode, 4 , 4, 1); // Bind L
-  __visc__bindIn(LNode, 5 , 5, 1); // Bind bytesL
-  __visc__bindIn(LNode, 22, 6, 1); // Bind m
-  __visc__bindIn(LNode, 23, 7, 1); // Bind n
+  __visc__bindIn(LNode, 16, 2, 1);         // Bind B
+  __visc__bindIn(LNode, 17, 3, 1);         // Bind bytesB
+  __visc__bindIn(LNode, 4, 4, 1);          // Bind L
+  __visc__bindIn(LNode, 5, 5, 1);          // Bind bytesL
+  __visc__bindIn(LNode, 22, 6, 1);         // Bind m
+  __visc__bindIn(LNode, 23, 7, 1);         // Bind n
 
   // Compute ZC Inputs
-  __visc__bindIn(CZCNode, 4 , 0, 1); // Bind L
+  __visc__bindIn(CZCNode, 4, 0, 1);         // Bind L
   __visc__edge(LNode, CZCNode, 1, 0, 1, 1); // Get bytesL
-  __visc__bindIn(CZCNode, 16, 2, 1); // Bind B
-  __visc__bindIn(CZCNode, 17, 3, 1); // Bind bytesB
-  __visc__bindIn(CZCNode, 6 , 4, 1); // Bind S
-  __visc__bindIn(CZCNode, 7 , 5, 1); // Bind bytesS
-  __visc__bindIn(CZCNode, 22, 6, 1); // Bind m
-  __visc__bindIn(CZCNode, 23, 7, 1); // Bind n
+  __visc__bindIn(CZCNode, 16, 2, 1);        // Bind B
+  __visc__bindIn(CZCNode, 17, 3, 1);        // Bind bytesB
+  __visc__bindIn(CZCNode, 6, 4, 1);         // Bind S
+  __visc__bindIn(CZCNode, 7, 5, 1);         // Bind bytesS
+  __visc__bindIn(CZCNode, 22, 6, 1);        // Bind m
+  __visc__bindIn(CZCNode, 23, 7, 1);        // Bind n
 
   // Gradient Inputs
-  __visc__bindIn(CGNode, 2 , 0, 1); // Bind Is
+  __visc__bindIn(CGNode, 2, 0, 1);          // Bind Is
   __visc__edge(GSNode, CGNode, 1, 1, 1, 1); // Get bytesIs
-  __visc__bindIn(CGNode, 18, 2, 1); // Bind Sx
-  __visc__bindIn(CGNode, 19, 3, 1); // Bind bytesSx
-  __visc__bindIn(CGNode, 20, 4, 1); // Bind Sy
-  __visc__bindIn(CGNode, 21, 5, 1); // Bind bytesSy
-  __visc__bindIn(CGNode, 8 , 6, 1); // Bind G
-  __visc__bindIn(CGNode, 9 , 7, 1); // Bind bytesG
-  __visc__bindIn(CGNode, 22, 8, 1); // Bind m
-  __visc__bindIn(CGNode, 23, 9, 1); // Bind n
+  __visc__bindIn(CGNode, 18, 2, 1);         // Bind Sx
+  __visc__bindIn(CGNode, 19, 3, 1);         // Bind bytesSx
+  __visc__bindIn(CGNode, 20, 4, 1);         // Bind Sy
+  __visc__bindIn(CGNode, 21, 5, 1);         // Bind bytesSy
+  __visc__bindIn(CGNode, 8, 6, 1);          // Bind G
+  __visc__bindIn(CGNode, 9, 7, 1);          // Bind bytesG
+  __visc__bindIn(CGNode, 22, 8, 1);         // Bind m
+  __visc__bindIn(CGNode, 23, 9, 1);         // Bind n
 
   // Max Gradient Inputs
-  __visc__bindIn(CMGNode, 8 , 0, 1); // Bind G
+  __visc__bindIn(CMGNode, 8, 0, 1);          // Bind G
   __visc__edge(CGNode, CMGNode, 1, 0, 1, 1); // Get bytesG
-  __visc__bindIn(CMGNode, 10, 2, 1); // Bind maxG
-  __visc__bindIn(CMGNode, 11, 3, 1); // Bind bytesMaxG
-  __visc__bindIn(CMGNode, 22, 4, 1);  // Bind m
-  __visc__bindIn(CMGNode, 23, 5, 1); // Bind n
-  __visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x
-  __visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x
+  __visc__bindIn(CMGNode, 10, 2, 1);         // Bind maxG
+  __visc__bindIn(CMGNode, 11, 3, 1);         // Bind bytesMaxG
+  __visc__bindIn(CMGNode, 22, 4, 1);         // Bind m
+  __visc__bindIn(CMGNode, 23, 5, 1);         // Bind n
+  __visc__bindIn(CMGNode, 24, 6, 1);         // Bind block_x
+  __visc__bindIn(CMGNode, 25, 7, 1);         // Bind grid_x
 
   // Reject ZC Inputs
-  __visc__bindIn(RZCNode, 6 , 0, 1); // Bind S
+  __visc__bindIn(RZCNode, 6, 0, 1);           // Bind S
   __visc__edge(CZCNode, RZCNode, 1, 0, 1, 1); // Get bytesS
-  __visc__bindIn(RZCNode, 8 , 2, 1); // Bind G
-  __visc__bindIn(RZCNode, 9 , 3, 1); // Bind bytesG
-  __visc__bindIn(RZCNode, 10, 4, 1); // Bind maxG
+  __visc__bindIn(RZCNode, 8, 2, 1);           // Bind G
+  __visc__bindIn(RZCNode, 9, 3, 1);           // Bind bytesG
+  __visc__bindIn(RZCNode, 10, 4, 1);          // Bind maxG
   __visc__edge(CMGNode, RZCNode, 1, 0, 5, 1); // Get bytesMaxG
-  __visc__bindIn(RZCNode, 12, 6, 1); // Bind E
-  __visc__bindIn(RZCNode, 13, 7, 1); // Bind bytesE
-  __visc__bindIn(RZCNode, 22, 8, 1); // Bind m
-  __visc__bindIn(RZCNode, 23, 9, 1); // Bind n
+  __visc__bindIn(RZCNode, 12, 6, 1);          // Bind E
+  __visc__bindIn(RZCNode, 13, 7, 1);          // Bind bytesE
+  __visc__bindIn(RZCNode, 22, 8, 1);          // Bind m
+  __visc__bindIn(RZCNode, 23, 9, 1);          // Bind n
 
   __visc__bindOut(RZCNode, 0, 0, 1); // Bind output
 }
-
 }
 
 using namespace cv;
 
-void getNextFrame(VideoCapture& VC, Mat& F) {
+void getNextFrame(VideoCapture &VC, Mat &F) {
   VC >> F;
   /// Convert the image to grayscale if image colored
-  if(F.channels() == 3)
-    cvtColor( F, F, CV_BGR2GRAY );
+  if (F.channels() == 3)
+    cvtColor(F, F, CV_BGR2GRAY);
 
-  F.convertTo(F, CV_32F, 1.0/255.0);
+  F.convertTo(F, CV_32F, 1.0 / 255.0);
 }
 
-int main (int argc, char *argv[]) {
+int main(int argc, char *argv[]) {
 
-    if (argc<2) {
-      fprintf(stderr, "Expecting input image filename\n");
-      exit(-1);
-    }
-    char* inFile = argv[1];
-    fprintf(stderr, "Running pipeline on %s\n", inFile);
+  if (argc < 2) {
+    fprintf(stderr, "Expecting input image filename\n");
+    exit(-1);
+  }
+  char *inFile = argv[1];
+  fprintf(stderr, "Running pipeline on %s\n", inFile);
 
-    size_t I_sz;
-    long block_x, grid_x;
+  size_t I_sz;
+  long block_x, grid_x;
 
-    std::cout << "Using OpenCV" << CV_VERSION << "\n";
+  std::cout << "Using OpenCV" << CV_VERSION << "\n";
 
-    /* Read in data */
-    std::cout << "Reading video file: " << inFile << "\n";
-    VideoCapture cap(inFile);
-    if(!cap.isOpened()) {
-      std::cout << "Could not open video file" << "\n";
-      return -1;
-    }
+  /* Read in data */
+  std::cout << "Reading video file: " << inFile << "\n";
+  VideoCapture cap(inFile);
+  if (!cap.isOpened()) {
+    std::cout << "Could not open video file"
+              << "\n";
+    return -1;
+  }
 
-    int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT);
-    NUM_FRAMES = 600;
-    std::cout << "Number of frames = " << NUM_FRAMES << "\n";
+  int NUM_FRAMES = cap.get(CV_CAP_PROP_FRAME_COUNT);
+  NUM_FRAMES = 600;
+  std::cout << "Number of frames = " << NUM_FRAMES << "\n";
 
-    namedWindow(input_window, CV_WINDOW_AUTOSIZE);
-    namedWindow(output_window, CV_WINDOW_AUTOSIZE);
-    moveWindow(input_window, POSX_IN, POSY_IN);
-    moveWindow(output_window, POSX_OUT, POSY_OUT);
+  namedWindow(input_window, CV_WINDOW_AUTOSIZE);
+  namedWindow(output_window, CV_WINDOW_AUTOSIZE);
+  moveWindow(input_window, POSX_IN, POSY_IN);
+  moveWindow(output_window, POSX_OUT, POSY_OUT);
 
-    Mat src, Is, L, S, G, E;
+  Mat src, Is, L, S, G, E;
 
-    getNextFrame(cap, src);
+  getNextFrame(cap, src);
 
-    std::cout << "Image dimension = " << src.size() << "\n";
-    if(!src.isContinuous()) {
-      std::cout << "Expecting contiguous storage of image in memory!\n";
-      exit(-1);
-    }
+  std::cout << "Image dimension = " << src.size() << "\n";
+  if (!src.isContinuous()) {
+    std::cout << "Expecting contiguous storage of image in memory!\n";
+    exit(-1);
+  }
 
-    Is = Mat(src.size[0], src.size[1], CV_32F);
-    L = Mat(src.size[0], src.size[1], CV_32F);
-    S = Mat(src.size[0], src.size[1], CV_32F);
-    G = Mat(src.size[0], src.size[1], CV_32F);
-    E = Mat(src.size[0], src.size[1], CV_32F);
-
-    // All these matrices need to have their data array contiguous in memory
-    assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() && S.isContinuous() && G.isContinuous() && E.isContinuous());
-
-    __visc__init();
-
-    // copy A to device memory
-    I_sz = src.size[0]*src.size[1]*sizeof(float);
-
-    size_t bytesMaxG = sizeof(float);
-    float* maxG = (float*)malloc(bytesMaxG);
-
-    float B[] = { 1, 1, 1,
-                  1, 1, 1,
-                  1, 1, 1 };
-    size_t bytesB = 9*sizeof(float);
-    float Sx[] = {  -1, 0, 1,
-                    -2, 0, 2,
-                    -1, 0, 1  };
-    size_t bytesSx = 9*sizeof(float);
-    float Sy[] = {  -1, -2, -1,
-                     0,  0,  0,
-                     1,  2,  1  };
-    size_t bytesSy = 9*sizeof(float);
-
-    float Gs [] = {
-            0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036,
-            0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363,
-            0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446,
-            0.002291, 0.023226, 0.092651, 0.146768, 0.092651, 0.023226, 0.002291,
-            0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446,
-            0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363,
-            0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036 };
-    size_t bytesGs = 7*7*sizeof(float);
-
-    block_x = 256;
-    // grid_x should be equal to the number of SMs on GPU. FTX 680 has 8 SMs
-    grid_x = 1;
-
-    Mat in, out;
-    resize(src, in, Size(HEIGHT, WIDTH));
-    resize(E, out, Size(HEIGHT, WIDTH));
-    imshow(input_window, in);
-    imshow(output_window, out);
-    waitKey(0);
-
-    struct InStruct* args = (struct InStruct*)malloc (sizeof(InStruct));
-    packData(args, (float*)src.data, I_sz,
-                   (float*)Is.data, I_sz,
-                   (float*)L.data, I_sz,
-                   (float*)S.data, I_sz,
-                   (float*)G.data, I_sz,
-                   maxG, bytesMaxG,
-                   (float*)E.data, I_sz,
-                   Gs, bytesGs,
-                   B, bytesB,
-                   Sx, bytesSx,
-                   Sy, bytesSy,
-                   src.size[0], src.size[1],
-                   block_x, grid_x);
-
-    // Check if the total elements is a multiple of block size
-    assert(src.size[0]*src.size[1] % block_x == 0);
-
-    for(unsigned j=0; j<NUM_RUNS; j++) {
-      std::cout << "Run: " << j << "\n";
-      void* DFG = __visc__launch(1, edgeDetection, (void*)args);
-
-      cap = VideoCapture(inFile);
-      getNextFrame(cap, src);
-      
-      if(NUM_FRAMES >=2) {
-        for(int i=0; i<NUM_FRAMES; i++) {
-          std::cout << "Frame " << i << "\n";
-          args->I = (float*) src.data;
-
-          *maxG = 0.0;
- 
-          llvm_visc_track_mem(src.data, I_sz);
-          llvm_visc_track_mem(Is.data, I_sz);
-          llvm_visc_track_mem(L.data, I_sz);
-          llvm_visc_track_mem(S.data, I_sz);
-          llvm_visc_track_mem(G.data, I_sz);
-          llvm_visc_track_mem(maxG, bytesMaxG);
-          llvm_visc_track_mem(E.data, I_sz);
-          llvm_visc_track_mem(Gs, bytesGs);
-          llvm_visc_track_mem(B, bytesB);
-          llvm_visc_track_mem(Sx, bytesSx);
-          llvm_visc_track_mem(Sy, bytesSy);
-
-          __visc__push(DFG, args);
-          void *ret = __visc__pop(DFG);
-          std::cout << "Returned size: " << *(size_t *)ret
-                    << " expected " << I_sz << '\n';
-
-          llvm_visc_request_mem(maxG, bytesMaxG);
-          llvm_visc_request_mem(E.data, I_sz);
-
-          Mat in, out;
-          resize(src, in, Size(HEIGHT, WIDTH));
-          resize(E, out, Size(HEIGHT, WIDTH));
-          imshow(output_window, out);
-          imshow(input_window, in);
-          waitKey(1);
-
-          llvm_visc_untrack_mem(src.data);
-          llvm_visc_untrack_mem(Is.data);
-          llvm_visc_untrack_mem(L.data);
-          llvm_visc_untrack_mem(S.data);
-          llvm_visc_untrack_mem(G.data);
-          llvm_visc_untrack_mem(maxG);
-          llvm_visc_untrack_mem(E.data);
-          llvm_visc_untrack_mem(Gs);
-          llvm_visc_untrack_mem(B);
-          llvm_visc_untrack_mem(Sx);
-          llvm_visc_untrack_mem(Sy);
-
-          getNextFrame(cap, src);
-        }
-      }
-      else {
-          __visc__push(DFG, args);
-          __visc__pop(DFG);
+  Is = Mat(src.size[0], src.size[1], CV_32F);
+  L = Mat(src.size[0], src.size[1], CV_32F);
+  S = Mat(src.size[0], src.size[1], CV_32F);
+  G = Mat(src.size[0], src.size[1], CV_32F);
+  E = Mat(src.size[0], src.size[1], CV_32F);
+
+  // All these matrices need to have their data array contiguous in memory
+  assert(src.isContinuous() && Is.isContinuous() && L.isContinuous() &&
+         S.isContinuous() && G.isContinuous() && E.isContinuous());
+
+  __visc__init();
+
+  // copy A to device memory
+  I_sz = src.size[0] * src.size[1] * sizeof(float);
+
+  size_t bytesMaxG = sizeof(float);
+  float *maxG = (float *)malloc(bytesMaxG);
+
+  float B[] = {1, 1, 1, 1, 1, 1, 1, 1, 1};
+  size_t bytesB = 9 * sizeof(float);
+  float Sx[] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};
+  size_t bytesSx = 9 * sizeof(float);
+  float Sy[] = {-1, -2, -1, 0, 0, 0, 1, 2, 1};
+  size_t bytesSy = 9 * sizeof(float);
+
+  float Gs[] = {
+      0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036,
+      0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363,
+      0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446,
+      0.002291, 0.023226, 0.092651, 0.146768, 0.092651, 0.023226, 0.002291,
+      0.001446, 0.014662, 0.058488, 0.092651, 0.058488, 0.014662, 0.001446,
+      0.000363, 0.003676, 0.014662, 0.023226, 0.014662, 0.003676, 0.000363,
+      0.000036, 0.000363, 0.001446, 0.002291, 0.001446, 0.000363, 0.000036};
+  size_t bytesGs = 7 * 7 * sizeof(float);
+
+  block_x = 256;
+  // grid_x should be equal to the number of SMs on GPU. FTX 680 has 8 SMs
+  grid_x = 1;
+
+  Mat in, out;
+  resize(src, in, Size(HEIGHT, WIDTH));
+  resize(E, out, Size(HEIGHT, WIDTH));
+  imshow(input_window, in);
+  imshow(output_window, out);
+  waitKey(0);
+
+  struct InStruct *args = (struct InStruct *)malloc(sizeof(InStruct));
+  packData(args, (float *)src.data, I_sz, (float *)Is.data, I_sz,
+           (float *)L.data, I_sz, (float *)S.data, I_sz, (float *)G.data, I_sz,
+           maxG, bytesMaxG, (float *)E.data, I_sz, Gs, bytesGs, B, bytesB, Sx,
+           bytesSx, Sy, bytesSy, src.size[0], src.size[1], block_x, grid_x);
+
+  // Check if the total elements is a multiple of block size
+  assert(src.size[0] * src.size[1] % block_x == 0);
+
+  for (unsigned j = 0; j < NUM_RUNS; j++) {
+    std::cout << "Run: " << j << "\n";
+    void *DFG = __visc__launch(1, edgeDetection, (void *)args);
+
+    cap = VideoCapture(inFile);
+    getNextFrame(cap, src);
+
+    if (NUM_FRAMES >= 2) {
+      for (int i = 0; i < NUM_FRAMES; i++) {
+        std::cout << "Frame " << i << "\n";
+        args->I = (float *)src.data;
+
+        *maxG = 0.0;
+
+        llvm_visc_track_mem(src.data, I_sz);
+        llvm_visc_track_mem(Is.data, I_sz);
+        llvm_visc_track_mem(L.data, I_sz);
+        llvm_visc_track_mem(S.data, I_sz);
+        llvm_visc_track_mem(G.data, I_sz);
+        llvm_visc_track_mem(maxG, bytesMaxG);
+        llvm_visc_track_mem(E.data, I_sz);
+        llvm_visc_track_mem(Gs, bytesGs);
+        llvm_visc_track_mem(B, bytesB);
+        llvm_visc_track_mem(Sx, bytesSx);
+        llvm_visc_track_mem(Sy, bytesSy);
+
+        __visc__push(DFG, args);
+        void *ret = __visc__pop(DFG);
+        std::cout << "Returned size: " << *(size_t *)ret << " expected " << I_sz
+                  << '\n';
+
+        llvm_visc_request_mem(maxG, bytesMaxG);
+        llvm_visc_request_mem(E.data, I_sz);
+
+        Mat in, out;
+        resize(src, in, Size(HEIGHT, WIDTH));
+        resize(E, out, Size(HEIGHT, WIDTH));
+        imshow(output_window, out);
+        imshow(input_window, in);
+        waitKey(1);
+
+        llvm_visc_untrack_mem(src.data);
+        llvm_visc_untrack_mem(Is.data);
+        llvm_visc_untrack_mem(L.data);
+        llvm_visc_untrack_mem(S.data);
+        llvm_visc_untrack_mem(G.data);
+        llvm_visc_untrack_mem(maxG);
+        llvm_visc_untrack_mem(E.data);
+        llvm_visc_untrack_mem(Gs);
+        llvm_visc_untrack_mem(B);
+        llvm_visc_untrack_mem(Sx);
+        llvm_visc_untrack_mem(Sy);
+
+        getNextFrame(cap, src);
       }
-      __visc__wait(DFG);
+    } else {
+      __visc__push(DFG, args);
+      __visc__pop(DFG);
     }
-    __visc__cleanup();
-    return 0;
+    __visc__wait(DFG);
+  }
+  __visc__cleanup();
+  return 0;
 }
diff --git a/hpvm/test/pipeline/src/visc.h b/hpvm/test/pipeline/src/visc.h
index 3a05f49e299a0a63a2251db65762561c25ed3981..917aec5a3773657e63655191b7897b9035b6d378 100644
--- a/hpvm/test/pipeline/src/visc.h
+++ b/hpvm/test/pipeline/src/visc.h
@@ -15,62 +15,62 @@
 #ifdef __cplusplus
 extern "C" {
 void __visc__hint(visc::Target);
-//void __visc__wait(void*);
+// void __visc__wait(void*);
 #else
 void __visc__hint(enum Target);
-//void __visc__wait(unsigned);
+// void __visc__wait(unsigned);
 #endif
 
 #ifdef __cplusplus
-//void* __visc__node(...);
-//void* __visc__createNode(...);
-//void* __visc__createNode1D(...);
-//void* __visc__createNode2D(...);
-//void* __visc__createNode3D(...);
-//void __visc__return(...);
+// void* __visc__node(...);
+// void* __visc__createNode(...);
+// void* __visc__createNode1D(...);
+// void* __visc__createNode2D(...);
+// void* __visc__createNode3D(...);
+// void __visc__return(...);
 #endif
 
-void* __visc__createNodeND(unsigned,...);
+void *__visc__createNodeND(unsigned, ...);
 void __visc__return(unsigned, ...);
 
 void __visc__attributes(unsigned, ...);
 void __visc__init();
 void __visc__cleanup();
 
-void __visc__bindIn(void*, unsigned, unsigned, unsigned);
-void __visc__bindOut(void*, unsigned, unsigned, unsigned);
-void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned);
-void __visc__push(void*, void*);
-void* __visc__pop(void*);
-void* __visc__launch(unsigned, ...);
-void __visc__wait(void*);
+void __visc__bindIn(void *, unsigned, unsigned, unsigned);
+void __visc__bindOut(void *, unsigned, unsigned, unsigned);
+void *__visc__edge(void *, void *, unsigned, unsigned, unsigned, unsigned);
+void __visc__push(void *, void *);
+void *__visc__pop(void *);
+void *__visc__launch(unsigned, ...);
+void __visc__wait(void *);
 
-void* __visc__getNode();
-void* __visc__getParentNode(void*);
+void *__visc__getNode();
+void *__visc__getParentNode(void *);
 void __visc__barrier();
-void* __visc__malloc(long);
-long __visc__getNodeInstanceID_x(void*);
-long __visc__getNodeInstanceID_y(void*);
-long __visc__getNodeInstanceID_z(void*);
-long __visc__getNumNodeInstances_x(void*);
-long __visc__getNumNodeInstances_y(void*);
-long __visc__getNumNodeInstances_z(void*);
+void *__visc__malloc(long);
+long __visc__getNodeInstanceID_x(void *);
+long __visc__getNodeInstanceID_y(void *);
+long __visc__getNodeInstanceID_z(void *);
+long __visc__getNumNodeInstances_x(void *);
+long __visc__getNumNodeInstances_y(void *);
+long __visc__getNumNodeInstances_z(void *);
 
 // Atomic
 // signed int
-int __visc__atomic_cmpxchg(int*, int, int);
-int __visc__atomic_add(int*, int);
-int __visc__atomic_sub(int*, int);
-int __visc__atomic_xchg(int*, int);
-int __visc__atomic_inc(int*);
-int __visc__atomic_dec(int*);
-int __visc__atomic_min(int*, int);
-int __visc__atomic_max(int*, int);
-int __visc__atomic_umax(int*, int);
-int __visc__atomic_umin(int*, int);
-int __visc__atomic_and(int*, int);
-int __visc__atomic_or(int*, int);
-int __visc__atomic_xor(int*, int);
+int __visc__atomic_cmpxchg(int *, int, int);
+int __visc__atomic_add(int *, int);
+int __visc__atomic_sub(int *, int);
+int __visc__atomic_xchg(int *, int);
+int __visc__atomic_inc(int *);
+int __visc__atomic_dec(int *);
+int __visc__atomic_min(int *, int);
+int __visc__atomic_max(int *, int);
+int __visc__atomic_umax(int *, int);
+int __visc__atomic_umin(int *, int);
+int __visc__atomic_and(int *, int);
+int __visc__atomic_or(int *, int);
+int __visc__atomic_xor(int *, int);
 
 // Special Func
 float __visc__floor(float);
@@ -79,18 +79,17 @@ float __visc__sqrt(float);
 float __visc__sin(float);
 float __visc__cos(float);
 // unsigned int
-//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned);
-//unsigned __visc__atomic_add(unsigned*, unsigned);
-//unsigned __visc__atomic_sub(unsigned*, unsigned);
-//unsigned __visc__atomic_xchg(unsigned*, unsigned);
-//unsigned __visc__atomic_inc(unsigned*);
-//unsigned __visc__atomic_dec(unsigned*);
-//unsigned __visc__atomic_min(unsigned*, unsigned);
-//unsigned __visc__atomic_max(unsigned*, unsigned);
-//unsigned __visc__atomic_and(unsigned*, unsigned);
-//unsigned __visc__atomic_or(unsigned*, unsigned);
-//unsigned __visc__atomic_xor(unsigned*, unsigned);
-
+// unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned);
+// unsigned __visc__atomic_add(unsigned*, unsigned);
+// unsigned __visc__atomic_sub(unsigned*, unsigned);
+// unsigned __visc__atomic_xchg(unsigned*, unsigned);
+// unsigned __visc__atomic_inc(unsigned*);
+// unsigned __visc__atomic_dec(unsigned*);
+// unsigned __visc__atomic_min(unsigned*, unsigned);
+// unsigned __visc__atomic_max(unsigned*, unsigned);
+// unsigned __visc__atomic_and(unsigned*, unsigned);
+// unsigned __visc__atomic_or(unsigned*, unsigned);
+// unsigned __visc__atomic_xor(unsigned*, unsigned);
 
 #include <unistd.h>
 
@@ -99,12 +98,10 @@ long get_group_id(int);
 long get_local_id(int);
 long get_local_size(int);
 
-
-void llvm_visc_track_mem(void*, size_t);
-void llvm_visc_untrack_mem(void*);
-void llvm_visc_request_mem(void*, size_t);
+void llvm_visc_track_mem(void *, size_t);
+void llvm_visc_untrack_mem(void *);
+void llvm_visc_request_mem(void *, size_t);
 
 #ifdef __cplusplus
 }
 #endif
-
diff --git a/hpvm/test/unitTests/CreateNodeAndEdge.c b/hpvm/test/unitTests/CreateNodeAndEdge.c
index f8ba09217de591d4ccc7cd81896d0d865b6d7ba5..1b6b1cff211d5af5a909065af988aadbe979f2ec 100644
--- a/hpvm/test/unitTests/CreateNodeAndEdge.c
+++ b/hpvm/test/unitTests/CreateNodeAndEdge.c
@@ -1,52 +1,50 @@
-#include <stdio.h>
 #include "visc.h"
+#include <stdio.h>
 
 struct Root {
   int *input;
   int *output;
 };
 
-
 void Func1(int *In, int *Out) {
-  __visc__hint (CPU_TARGET);
+  __visc__hint(CPU_TARGET);
   __visc__attributes(1, In, 1, Out);
 
   __visc__return(1, Out);
 }
 
 void Func2(int *BindIn, int *SrcIn, int *Out) {
-  __visc__hint (CPU_TARGET);
+  __visc__hint(CPU_TARGET);
   __visc__attributes(2, BindIn, SrcIn, 1, Out);
 
   __visc__return(1, Out);
 }
 
 void PipeRoot(int *In, int *Out) {
-    __visc__hint (CPU_TARGET);
+  __visc__hint(CPU_TARGET);
 
-    __visc__attributes(1, In, 1, Out);
+  __visc__attributes(1, In, 1, Out);
 
-    void* SrcNode = __visc__createNodeND(0, Func1);
-    void* DestNode = __visc__createNodeND(0, Func2);
+  void *SrcNode = __visc__createNodeND(0, Func1);
+  void *DestNode = __visc__createNodeND(0, Func2);
 
-    __visc__bindIn(SrcNode, 0, 0, 0);
+  __visc__bindIn(SrcNode, 0, 0, 0);
 
-    __visc__bindIn(DestNode, 0, 0, 0);
-   __visc__edge(SrcNode, DestNode, 1, 0, 1, 0);
+  __visc__bindIn(DestNode, 0, 0, 0);
+  __visc__edge(SrcNode, DestNode, 1, 0, 1, 0);
 
-    __visc__bindOut(SrcNode, 0, 0, 0);
+  __visc__bindOut(SrcNode, 0, 0, 0);
 }
 
-int  main(void) {
-    int In = 1;
-    int Out = 0;
-    struct Root RootArgs = {(int *) &In, (int *) &Out};
+int main(void) {
+  int In = 1;
+  int Out = 0;
+  struct Root RootArgs = {(int *)&In, (int *)&Out};
 
-    __visc__init();
-    void* PipeDFG = __visc__launch(0, PipeRoot, (void *) &RootArgs);
-    __visc__wait(PipeDFG);
-    __visc__cleanup();
+  __visc__init();
+  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)&RootArgs);
+  __visc__wait(PipeDFG);
+  __visc__cleanup();
 
-    return 0;
+  return 0;
 }
-
diff --git a/hpvm/test/unitTests/MallocIntrinsic.c b/hpvm/test/unitTests/MallocIntrinsic.c
index fbc3d3ef0802198e71f69cd1cbd2347a413e2a3e..cfd041a991d976c24b372a81b35842598b571d89 100644
--- a/hpvm/test/unitTests/MallocIntrinsic.c
+++ b/hpvm/test/unitTests/MallocIntrinsic.c
@@ -1,5 +1,5 @@
-#include <stdlib.h>
 #include "visc.h"
+#include <stdlib.h>
 
 struct Root {
   int *input;
@@ -7,32 +7,31 @@ struct Root {
 };
 
 void PipeRoot(int *In, int *Out) {
-  __visc__hint (CPU_TARGET);
+  __visc__hint(CPU_TARGET);
   __visc__attributes(1, In, 1, Out);
 
- Out = (int *)__visc__malloc(*In);
+  Out = (int *)__visc__malloc(*In);
 
   __visc__return(1, Out);
 }
 
-int  main(void) {
-    int In, Out;
+int main(void) {
+  int In, Out;
 
-   // struct Root RootArgs;
-   // RootArgs.input = (int *)&In;
-   // RootArgs.output = (int *)&Out;
+  // struct Root RootArgs;
+  // RootArgs.input = (int *)&In;
+  // RootArgs.output = (int *)&Out;
 
-     struct Root* RootArgs = (struct Root *) malloc(sizeof(struct Root));
-     RootArgs->input = (int *)&In;
-     RootArgs->output = (int *)&Out;
+  struct Root *RootArgs = (struct Root *)malloc(sizeof(struct Root));
+  RootArgs->input = (int *)&In;
+  RootArgs->output = (int *)&Out;
 
-    __visc__init();
+  __visc__init();
 
-    void* PipeDFG = __visc__launch(0, PipeRoot, (void *) RootArgs);
-    __visc__wait(PipeDFG);
+  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs);
+  __visc__wait(PipeDFG);
 
-    __visc__cleanup();
+  __visc__cleanup();
 
-    return 0;
+  return 0;
 }
-
diff --git a/hpvm/test/unitTests/PipelineIntrinsics.c b/hpvm/test/unitTests/PipelineIntrinsics.c
index 0c1932129266eb55564b199d2451b73c0ce21a73..2a9bf83402891beddf13d96c6346e8fed924d17e 100644
--- a/hpvm/test/unitTests/PipelineIntrinsics.c
+++ b/hpvm/test/unitTests/PipelineIntrinsics.c
@@ -1,5 +1,5 @@
-#include <stdlib.h>
 #include "visc.h"
+#include <stdlib.h>
 
 struct Root {
   int *input;
@@ -7,29 +7,28 @@ struct Root {
 };
 
 void PipeRoot(int *In, int *Out) {
-  __visc__hint (CPU_TARGET);
+  __visc__hint(CPU_TARGET);
   __visc__attributes(1, In, 1, Out);
   __visc__return(1, Out);
 }
 
-int  main(void) {
-    int In, Out;
+int main(void) {
+  int In, Out;
 
-   // struct Root RootArgs;
-   // RootArgs.input = (int *)&In;
-   // RootArgs.output = (int *)&Out;
+  // struct Root RootArgs;
+  // RootArgs.input = (int *)&In;
+  // RootArgs.output = (int *)&Out;
 
-     struct Root* RootArgs = (struct Root *) malloc(sizeof(struct Root));
-     RootArgs->input = (int *)&In;
-     RootArgs->output = (int *)&Out;
+  struct Root *RootArgs = (struct Root *)malloc(sizeof(struct Root));
+  RootArgs->input = (int *)&In;
+  RootArgs->output = (int *)&Out;
 
-    __visc__init();
+  __visc__init();
 
-    void* PipeDFG = __visc__launch(0, PipeRoot, (void *) RootArgs);
-    __visc__wait(PipeDFG);
+  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs);
+  __visc__wait(PipeDFG);
 
-    __visc__cleanup();
+  __visc__cleanup();
 
-    return 0;
+  return 0;
 }
-
diff --git a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c
index e81e489f4faead6bb099b1a588191df98e737cdc..36fc02d22b066025be4a57695265779d8e55652a 100644
--- a/hpvm/test/unitTests/PipelineIntrinsics.malloc.c
+++ b/hpvm/test/unitTests/PipelineIntrinsics.malloc.c
@@ -1,5 +1,5 @@
-#include <stdlib.h>
 #include "visc.h"
+#include <stdlib.h>
 
 struct Root {
   int *input;
@@ -7,25 +7,24 @@ struct Root {
 };
 
 void PipeRoot(int *In, int *Out) {
-  __visc__hint (CPU_TARGET);
+  __visc__hint(CPU_TARGET);
   __visc__attributes(1, In, 1, Out);
   __visc__return(1, Out);
 }
 
-int  main(void) {
-    int In, Out;
+int main(void) {
+  int In, Out;
 
-    __visc__init();
+  __visc__init();
 
-     struct Root* RootArgs = (struct Root *) malloc(sizeof(struct Root));
-     RootArgs->input = (int *)&In;
-     RootArgs->output = (int *)&Out;
+  struct Root *RootArgs = (struct Root *)malloc(sizeof(struct Root));
+  RootArgs->input = (int *)&In;
+  RootArgs->output = (int *)&Out;
 
-    void* PipeDFG = __visc__launch(0, PipeRoot, (void *) RootArgs);
-    __visc__wait(PipeDFG);
+  void *PipeDFG = __visc__launch(0, PipeRoot, (void *)RootArgs);
+  __visc__wait(PipeDFG);
 
-    __visc__cleanup();
+  __visc__cleanup();
 
-    return 0;
+  return 0;
 }
-
diff --git a/hpvm/test/unitTests/visc.h b/hpvm/test/unitTests/visc.h
index 4faba0d93f16d85272ae4bfcbb3dec1c4b37e140..0b52345b59f7d30e5e00a4dc4102f024444af47c 100644
--- a/hpvm/test/unitTests/visc.h
+++ b/hpvm/test/unitTests/visc.h
@@ -20,54 +20,54 @@ void __visc__hint(enum Target);
 #endif
 
 #ifdef __cplusplus
-void* __visc__node(...);
-//void* __visc__createNode(...);
-//void* __visc__createNode1D(...);
-//void* __visc__createNode2D(...);
-//void* __visc__createNode3D(...);
-//void __visc__return(...);
+void *__visc__node(...);
+// void* __visc__createNode(...);
+// void* __visc__createNode1D(...);
+// void* __visc__createNode2D(...);
+// void* __visc__createNode3D(...);
+// void __visc__return(...);
 #endif
-void* __visc__createNodeND(unsigned, ...);
+void *__visc__createNodeND(unsigned, ...);
 void __visc__return(unsigned, ...);
 
 void __visc__attributes(unsigned, ...);
 void __visc__init();
 void __visc__cleanup();
 
-void __visc__bindIn(void*, unsigned, unsigned, unsigned);
-void __visc__bindOut(void*, unsigned, unsigned, unsigned);
-void* __visc__edge(void*, void*, unsigned, unsigned, unsigned, unsigned);
-void __visc__push(void*, void*);
-void* __visc__pop(void*);
-void* __visc__launch(unsigned, ...);
-void __visc__wait(void*);
+void __visc__bindIn(void *, unsigned, unsigned, unsigned);
+void __visc__bindOut(void *, unsigned, unsigned, unsigned);
+void *__visc__edge(void *, void *, unsigned, unsigned, unsigned, unsigned);
+void __visc__push(void *, void *);
+void *__visc__pop(void *);
+void *__visc__launch(unsigned, ...);
+void __visc__wait(void *);
 
-void* __visc__getNode();
-void* __visc__getParentNode(void*);
+void *__visc__getNode();
+void *__visc__getParentNode(void *);
 void __visc__barrier();
-void* __visc__malloc(long);
-long __visc__getNodeInstanceID_x(void*);
-long __visc__getNodeInstanceID_y(void*);
-long __visc__getNodeInstanceID_z(void*);
-long __visc__getNumNodeInstances_x(void*);
-long __visc__getNumNodeInstances_y(void*);
-long __visc__getNumNodeInstances_z(void*);
+void *__visc__malloc(long);
+long __visc__getNodeInstanceID_x(void *);
+long __visc__getNodeInstanceID_y(void *);
+long __visc__getNodeInstanceID_z(void *);
+long __visc__getNumNodeInstances_x(void *);
+long __visc__getNumNodeInstances_y(void *);
+long __visc__getNumNodeInstances_z(void *);
 
 // Atomic
 // signed int
-int __visc__atomic_cmpxchg(int*, int, int);
-int __visc__atomic_add(int*, int);
-int __visc__atomic_sub(int*, int);
-int __visc__atomic_xchg(int*, int);
-int __visc__atomic_inc(int*);
-int __visc__atomic_dec(int*);
-int __visc__atomic_min(int*, int);
-int __visc__atomic_max(int*, int);
-int __visc__atomic_umax(int*, int);
-int __visc__atomic_umin(int*, int);
-int __visc__atomic_and(int*, int);
-int __visc__atomic_or(int*, int);
-int __visc__atomic_xor(int*, int);
+int __visc__atomic_cmpxchg(int *, int, int);
+int __visc__atomic_add(int *, int);
+int __visc__atomic_sub(int *, int);
+int __visc__atomic_xchg(int *, int);
+int __visc__atomic_inc(int *);
+int __visc__atomic_dec(int *);
+int __visc__atomic_min(int *, int);
+int __visc__atomic_max(int *, int);
+int __visc__atomic_umax(int *, int);
+int __visc__atomic_umin(int *, int);
+int __visc__atomic_and(int *, int);
+int __visc__atomic_or(int *, int);
+int __visc__atomic_xor(int *, int);
 
 // Special Func
 float __visc__floor(float);
@@ -76,18 +76,17 @@ float __visc__sqrt(float);
 float __visc__sin(float);
 float __visc__cos(float);
 // unsigned int
-//unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned);
-//unsigned __visc__atomic_add(unsigned*, unsigned);
-//unsigned __visc__atomic_sub(unsigned*, unsigned);
-//unsigned __visc__atomic_xchg(unsigned*, unsigned);
-//unsigned __visc__atomic_inc(unsigned*);
-//unsigned __visc__atomic_dec(unsigned*);
-//unsigned __visc__atomic_min(unsigned*, unsigned);
-//unsigned __visc__atomic_max(unsigned*, unsigned);
-//unsigned __visc__atomic_and(unsigned*, unsigned);
-//unsigned __visc__atomic_or(unsigned*, unsigned);
-//unsigned __visc__atomic_xor(unsigned*, unsigned);
-
+// unsigned __visc__atomic_cmpxchg(unsigned*, unsigned, unsigned);
+// unsigned __visc__atomic_add(unsigned*, unsigned);
+// unsigned __visc__atomic_sub(unsigned*, unsigned);
+// unsigned __visc__atomic_xchg(unsigned*, unsigned);
+// unsigned __visc__atomic_inc(unsigned*);
+// unsigned __visc__atomic_dec(unsigned*);
+// unsigned __visc__atomic_min(unsigned*, unsigned);
+// unsigned __visc__atomic_max(unsigned*, unsigned);
+// unsigned __visc__atomic_and(unsigned*, unsigned);
+// unsigned __visc__atomic_or(unsigned*, unsigned);
+// unsigned __visc__atomic_xor(unsigned*, unsigned);
 
 #include <unistd.h>
 
@@ -96,12 +95,10 @@ long get_group_id(int);
 long get_local_id(int);
 long get_local_size(int);
 
-
-void llvm_visc_track_mem(void*, size_t);
-void llvm_visc_untrack_mem(void*);
-void llvm_visc_request_mem(void*, size_t);
+void llvm_visc_track_mem(void *, size_t);
+void llvm_visc_untrack_mem(void *);
+void llvm_visc_request_mem(void *, size_t);
 
 #ifdef __cplusplus
 }
 #endif
-