diff --git a/hpvm/test/MatrixMultiplication/Makefile b/hpvm/test/MatrixMultiplication/Makefile
deleted file mode 100644
index 94f22ed717fdf89427cc36ae80c053ba0a19c790..0000000000000000000000000000000000000000
--- a/hpvm/test/MatrixMultiplication/Makefile
+++ /dev/null
@@ -1,34 +0,0 @@
-# Extra optimisation passes; not referenced by any rule visible in this file.
-PASSES :=
-
-# `all` and `gen` name commands rather than files, so they are phony as well:
-# a stray file called `all` or `gen` must not silently disable them.
-.PHONY: all gen clean
-
-# Toolchain locations. `?=` keeps the historical defaults while letting the
-# environment (or the command line) point at a different checkout.
-LLVM_INSTALL ?= /home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
-LIBCLC ?= /home/psrivas2/Hetero/VISC/Code/trunk/libclc
-
-# Base names (without extension) of the host program and the OpenCL kernels.
-HOST := gemm_opencl
-KERNELS := matrixMul
-
-# Compiler and bitcode linker taken from the LLVM install tree above.
-LLVM_CC := $(LLVM_INSTALL)/bin/clang
-LLVM_LINK := $(LLVM_INSTALL)/bin/llvm-link
-
-# Default goal: NVPTX assembly for every kernel, plus the host IR and binary.
-all: $(addsuffix .nvptx.s,$(KERNELS)) $(addsuffix .ll,$(HOST)) $(addsuffix .bin,$(HOST))
-
-# Compile the plain C GEMM to optimised LLVM IR.
-auto_gemm.ll: auto_gemm.c
-	~/current-bin/clang -S -emit-llvm $< -O3 -o $@
-
-# Run the GenVISC pass over that IR. The output is a real file target, so make
-# can see it in the dependency graph and skip the step when it is up to date.
-auto_gemm_visc.ll: auto_gemm.ll
-	~/current-src/Release+Asserts/bin/opt -load ~/current-src/Release+Asserts/lib/LLVMGenVISC.so -genvisc $< -S -o $@
-
-# Convenience alias kept so that `make gen` continues to work.
-.PHONY: gen
-gen: auto_gemm_visc.ll
-
-# OpenCL C kernel source -> NVPTX-targeted LLVM IR, using the libclc headers.
-$(addsuffix .ll,$(KERNELS)): %.ll: %.cl
-	$(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
-
-# Link the kernel IR against the prebuilt libclc bitcode.
-$(addsuffix .linked.bc,$(KERNELS)): %.linked.bc: %.ll
-	$(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
-
-# Lower the linked bitcode to NVPTX assembly.
-$(addsuffix .nvptx.s,$(KERNELS)): %.nvptx.s: %.linked.bc
-	$(LLVM_CC) -O3 -target nvptx $< -S -o $@
-
-# Host program -> LLVM IR.
-$(HOST:%=%.ll):%.ll:%.c
-	$(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@
-
-# Host program -> native executable.
-# Libraries are listed after the input that references them: the linker
-# resolves symbols left to right, so a -l option placed before the source can
-# be dropped (e.g. under --as-needed), leaving the OpenCL symbols undefined.
-$(HOST:%=%.bin):%.bin:%.c
-	$(LLVM_CC) -O3 -I /usr/local/cuda/include $< -lOpenCL -o $@
-
-# Remove generated files. The substitution references append `.ll` to every
-# word, so the list stays correct if HOST or KERNELS ever hold several names
-# (`$(HOST).ll` would only suffix the last word).
-clean :
-	rm -f $(HOST:%=%.ll) $(KERNELS:%=%.ll) *.bc *.s nvptx.s* *.bin *.kernels.ll DataflowGraph.dot*
diff --git a/hpvm/test/MatrixMultiplication/gemm.c b/hpvm/test/MatrixMultiplication/gemm.c
deleted file mode 100644
index 7356b8293ddba0c4cd8101649dc10fcd41c2a600..0000000000000000000000000000000000000000
--- a/hpvm/test/MatrixMultiplication/gemm.c
+++ /dev/null
@@ -1,168 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-
-#define WA 1024
-#define HA 1024
-#define WB 1024
-#define HB WA
-#define WC WB
-#define HC HA
-
-
-
-// Thread block size
-#define BLOCK_SIZE 16
-
-// Fills the first `size` entries of `data` with rand() / (float)RAND_MAX.
-// The buffer must already be allocated by the caller.
-void randomInit(float* data, int size) {
-  const float scale = (float)RAND_MAX;
-  int idx = 0;
-  while (idx < size) {
-    data[idx] = rand() / scale;
-    ++idx;
-  }
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Loads a Program file.
-//!
-//! @return the source string if succeeded, 0 otherwise
-//! @param cFilename        program filename
-//! @param szFinalLength    returned length of the code string
-//////////////////////////////////////////////////////////////////////////////
-
-// Returns 1 when a and b differ by less than 0.001, 0 otherwise.
-int isEqual(float a, float b) {
-  const double gap = fabs(a-b);
-  if (gap < 0.001)
-    return 1;
-  return 0;
-}
-
-// Check Results
-//
-// Recomputes A * B on the host with a plain triple loop and compares every
-// element against the supplied result matrix C.
-// Returns 1 when all elements match within isEqual()'s tolerance; on the first
-// mismatch it prints the offending position and both values and returns 0.
-__attribute__ ((noinline)) int checkResults(float* A, float* B, float* C) {
-  for (int i=0; i < HC; i++) {
-    for (int j=0; j < WC; j++) {
-      // Only one reference element is live at a time, so a scalar accumulator
-      // replaces the former WC*HC-element scratch buffer, which was malloc'ed
-      // without a NULL check and never freed on either return path.
-      float gold = 0;
-      for (int k=0; k < HB; k++) {
-        gold += A[i*WA + k] * B[k*WB + j];
-      }
-      if(!isEqual(gold, C[i*WC + j])) {
-        printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], gold);
-        return 0;
-      }
-    }
-  }
-  return 1; // Success
-}
-
-
-// Value returned by matrixMul / computeMatrixMul: a buffer pointer and a size.
-typedef struct {
-  // Result buffer; matrixMul stores its C argument here.
-  float* Out;
-  // matrixMul stores its bytes_C argument here.
-  int bytes_Out;
-} rtype;
-
-// Computes a single element of C: the dot product, over k terms, of row `ty`
-// of A (row length k) with column `tx` of B (row length n), stored at
-// C[ty*n+tx]. tx and ty come from get_global_id(0) and get_global_id(1).
-//
-// Returns an rtype holding C and bytes_C.
-//
-// NOTE(review): get_global_id is not declared anywhere in the visible source;
-// presumably it is supplied or replaced by the VISC/OpenCL tool flow - confirm.
-// NOTE(review): bytes_A, bytes_B, m, idx_x and idx_y are never read here.
-// NOTE(review): the loop counter is a signed int compared against unsigned k,
-// and the printf inside the loop runs once per multiply-add.
-rtype matrixMul(float* A, int bytes_A, float* B, int bytes_B, float* C, int bytes_C, unsigned k, unsigned n, unsigned m, int idx_x, int idx_y) {
-
-  printf("Entered function\n");
-  int tx = get_global_id(0); //2D Global Thread ID x
-  int ty = get_global_id(1); //2D Global Thread ID y
-  //int tx = get_global_id(0); //2D Global Thread ID x
-  //int ty = get_global_id(1); //2D Global Thread ID y
-
-  printf("Computing element (%d, %d)\n", tx, ty);
-  // Initialize accumulator
-  float res = 0.0f;
-
-  // Perform dot-product of row-column
-  for (int i = 0; i < k; i++) {
-    printf("Accessing k = %d, A[%d], B[%d]\n", k, ty*k+i, i*n+tx);
-    res += A[ty*k+i] * B[i*n+tx];
-  }
-
-  printf("Result computed\n");
-  // Write the finished element into the result matrix
-  C[ty*n+tx] = res;
-
-  printf("Result written to C\n");
-  // Package the result buffer and its byte size for the caller
-  rtype Output;
-  Output.Out = C;
-  Output.bytes_Out = bytes_C;
-  printf("Output allocated\n");
-  return Output;
-
-}
-
-
-// CPU Computation of MatrixMul
-//
-// Calls matrixMul once for every (i, j) with i in [0, m) and j in [0, n) and
-// returns the rtype produced by the last call.
-__attribute__ ((noinline)) rtype computeMatrixMul(float* h_A, unsigned bytes_A, float* h_B, unsigned bytes_B, float* h_C, unsigned bytes_C, unsigned k, unsigned n, unsigned m ) {
-
-  // Pre-load the value every matrixMul call returns (C and bytes_C), so the
-  // result is well defined even when m or n is 0 and the loops never execute.
-  // Previously `Out` was returned uninitialised in that case.
-  rtype Out;
-  Out.Out = h_C;
-  Out.bytes_Out = bytes_C;
-  for(unsigned i=0; i<m; i++) {
-    for(unsigned j=0; j < n; j++) {
-      Out = matrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, k, n, m, i, j);
-    }
-  }
-  return Out;
-
-}
-
-// Main
-//
-// Fills matrices A and B with pseudo-random values, multiplies them through
-// computeMatrixMul, verifies the product with checkResults and prints
-// "Pass!" or "Failed!" followed by "Done!".
-// Returns 0 after a completed run, EXIT_FAILURE if a matrix cannot be allocated.
-int main(int argc, char** argv) {
-
-  // seed for rand()
-  srand(2006);
-
-  // Element and byte counts for A, B and the result matrix C
-  unsigned int size_A = WA * HA;
-  unsigned int bytes_A = sizeof(float) * size_A;
-  unsigned int size_B = WB * HB;
-  unsigned int bytes_B = sizeof(float) * size_B;
-  unsigned int size_C = WC * HC;
-  unsigned int bytes_C = sizeof(float) * size_C;
-
-  // Allocate host memory for all three matrices
-  float* h_A = (float*) malloc(bytes_A);
-  float* h_B = (float*) malloc(bytes_B);
-  float* h_C = (float*) malloc(bytes_C);
-  if (h_A == NULL || h_B == NULL || h_C == NULL) {
-    // The allocations used to go unchecked, so a failed malloc was
-    // dereferenced by randomInit / matrixMul. free(NULL) is a no-op.
-    fprintf(stderr, "Failed to allocate host matrices\n");
-    free(h_A);
-    free(h_B);
-    free(h_C);
-    return EXIT_FAILURE;
-  }
-
-  // Initialize host memory
-  randomInit(h_A, size_A);
-  randomInit(h_B, size_B);
-
-  // Compute the product C = A * B
-  rtype Output = computeMatrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C, WA, WB, HA);
-
-  if(checkResults(h_A, h_B, Output.Out))
-    printf("\nPass!\n");
-  else
-    printf("\nFailed!\n");
-  printf("\nDone!\n");
-
-  // Deallocate memory
-  free(h_A);
-  free(h_B);
-  free(h_C);
-  return 0;
-}
-
diff --git a/hpvm/test/MatrixMultiplication/visc_gemm.ll b/hpvm/test/MatrixMultiplication/visc_gemm.ll
deleted file mode 100644
index 033b481af786dd936b0fbe383f3723d5faf237c6..0000000000000000000000000000000000000000
--- a/hpvm/test/MatrixMultiplication/visc_gemm.ll
+++ /dev/null
@@ -1,416 +0,0 @@
-; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
-; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
-; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
-; RUN: %t.bin
-; ModuleID = 'gemm_opencl.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1
-@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1
-@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
-@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1
-@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1
-@str = private unnamed_addr constant [17 x i8] c"Entered function\00"
-@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00"
-@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00"
-@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00"
-@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00"
-@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00"
-@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00"
-
-; Stores rand() scaled by a constant into %data[0 .. %size-1].
-; Nothing is written when %size <= 0.
-; Function Attrs: nounwind uwtable
-define void @randomInit(float* nocapture %data, i32 %size) #0 {
-entry:
-  ; Skip the loop entirely for non-positive sizes.
-  %cmp3 = icmp sgt i32 %size, 0
-  br i1 %cmp3, label %for.body, label %for.end
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %call = tail call i32 @rand() #5
-  %conv = sitofp i32 %call to float
-  ; 0x3E00000000000000 is the hex encoding of 2^-31; presumably the folded
-  ; reciprocal of (float)RAND_MAX from the C source - confirm.
-  %div = fmul float %conv, 0x3E00000000000000
-  %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv
-  store float %div, float* %arrayidx, align 4, !tbaa !0
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  ; The 64-bit induction variable is truncated to compare against i32 %size.
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %size
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ret void
-}
-
-; Function Attrs: nounwind
-declare i32 @rand() #1
-
-; Returns 1 when |%a - %b| < 1.0e-03 and 0 otherwise.
-; Function Attrs: nounwind readnone uwtable
-define i32 @isEqual(float %a, float %b) #2 {
-entry:
-  %sub = fsub float %a, %b
-  %fabsf = tail call float @fabsf(float %sub) #6
-  ; The comparison is carried out in double precision (ordered less-than).
-  %0 = fpext float %fabsf to double
-  %cmp = fcmp olt double %0, 1.000000e-03
-  %conv1 = zext i1 %cmp to i32
-  ret i32 %conv1
-}
-
-; Recomputes the 1024x1024 product of %A and %B element by element and compares
-; each value against %C. Returns 1 if every element is within 1.0e-03 of the
-; recomputed value; otherwise prints the first mismatch and returns 0.
-; Function Attrs: noinline nounwind uwtable
-define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 {
-entry:
-  br label %for.cond4.preheader
-
-; Outer loop over rows (i). %0 = i * 1024 is the row offset (shl by 10).
-for.cond4.preheader:                              ; preds = %entry, %for.inc50
-  %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ]
-  %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ]
-  %0 = shl nsw i64 %indvars.iv92, 10
-  br label %for.body7
-
-; Column-loop latch: advance j and continue while j < 1024.
-for.cond4:                                        ; preds = %for.end
-  %inc48 = add nsw i32 %j.079, 1
-  %1 = trunc i64 %indvars.iv.next89 to i32
-  %cmp5 = icmp slt i32 %1, 1024
-  br i1 %cmp5, label %for.body7, label %for.inc50
-
-; Middle loop over columns (j). %2 = i * 1024 + j indexes the element in %C.
-for.body7:                                        ; preds = %for.cond4.preheader, %for.cond4
-  %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ]
-  %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ]
-  %2 = add nsw i64 %indvars.iv88, %0
-  br label %for.body12
-
-; Inner loop over k: %3 accumulates A[i*1024 + k] * B[k*1024 + j] for 1024 terms.
-for.body12:                                       ; preds = %for.body12, %for.body7
-  %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ]
-  %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ]
-  %4 = add nsw i64 %indvars.iv, %0
-  %arrayidx16 = getelementptr inbounds float* %A, i64 %4
-  %5 = load float* %arrayidx16, align 4, !tbaa !0
-  %6 = shl i64 %indvars.iv, 10
-  %7 = add nsw i64 %6, %indvars.iv88
-  %arrayidx20 = getelementptr inbounds float* %B, i64 %7
-  %8 = load float* %arrayidx20, align 4, !tbaa !0
-  %mul21 = fmul float %5, %8
-  %add26 = fadd float %3, %mul21
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.end, label %for.body12
-
-; Compare the accumulated value with C[i*1024 + j]; same test as @isEqual
-; (|difference| < 1.0e-03, evaluated in double precision).
-for.end:                                          ; preds = %for.body12
-  %arrayidx34 = getelementptr inbounds float* %C, i64 %2
-  %9 = load float* %arrayidx34, align 4, !tbaa !0
-  %sub.i = fsub float %add26, %9
-  %fabsf.i = tail call float @fabsf(float %sub.i) #6
-  %10 = fpext float %fabsf.i to double
-  %cmp.i = fcmp olt double %10, 1.000000e-03
-  %indvars.iv.next89 = add i64 %indvars.iv88, 1
-  br i1 %cmp.i, label %for.cond4, label %if.then
-
-; Mismatch: report i, j, the value from %C and the recomputed value, then fail.
-if.then:                                          ; preds = %for.end
-  %conv40 = fpext float %9 to double
-  %conv45 = fpext float %add26 to double
-  %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5
-  br label %return
-
-; Row-loop latch: advance i and continue while i < 1024.
-for.inc50:                                        ; preds = %for.cond4
-  %indvars.iv.next93 = add i64 %indvars.iv92, 1
-  %inc51 = add nsw i32 %i.081, 1
-  %11 = trunc i64 %indvars.iv.next93 to i32
-  %cmp = icmp slt i32 %11, 1024
-  br i1 %cmp, label %for.cond4.preheader, label %return
-
-; Returns 0 when reached from the mismatch path, 1 when all rows were checked.
-return:                                           ; preds = %for.inc50, %if.then
-  %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ]
-  ret i32 %retval.0
-}
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #1
-
-; Function Attrs: nounwind
-declare i32 @printf(i8* nocapture, ...) #1
-
-; --------------- VISC Intrinsics ---------------
-; Return Type of VISC Compute Matrix Mul (an empty struct: no fields)
-%rtype = type {}
-; Packed (<{ ... }>) argument record passed to llvm.visc.launch in @main.
-; Fields 0-8 are written by @main in this order: h_A, bytes_A, h_B, bytes_B,
-; h_C, bytes_C, WA, WB, HA. Field 9 (%rtype) is read back after llvm.visc.wait.
-%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }>
-
-; Called in @main before the launch arguments are set up.
-; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
-
-; Called in @main after the result has been read back.
-; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
-
-; @main passes (root node function cast to i8*, packed argument record cast to
-; i8*) and hands the returned i8* to llvm.visc.wait.
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
-
-; Takes the i8* returned by llvm.visc.launch.
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
-
-; @MatrixMulRoot passes (kernel function cast to i8*, %WB, %HA).
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0
-
-; NOTE(review): no attribute group is attached to this declaration, unlike the
-; intrinsics above.
-declare void @llvm.visc.bind.input(i8*, i32, i32)
-
-; NOTE(review): no attribute group is attached to this declaration, unlike the
-; intrinsics above.
-declare void @llvm.visc.bind.output(i8*, i32, i32)
-; ----------------- VISC intrinsics end ------------------
-
-; Kernel node: computes one element of C. With x = getNodeInstanceID.x and
-; y = getNodeInstanceID.y of the current node, it stores
-;   C[y*n + x] = sum over i in [0, k) of A[y*k + i] * B[i*n + x]
-; (0.0 when k == 0). %bytes_A, %bytes_B, %bytes_C and %m are not read.
-; NOTE(review): `in` / `out` on the pointer parameters are not stock LLVM
-; parameter attributes; presumably VISC extensions - confirm.
-; Function Attrs: nounwind uwtable
-define %rtype @matrixMul(float* nocapture in %A, i64 %bytes_A, float* nocapture in %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
-entry:
-  ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0))
-  
-  ; ------------------------- VISC changes ------------------
-  ; Replace get_global_id calls with a call to getNode followed by
-  ; getNodeInstanceID.x / getNodeInstanceID.y on the returned node.
-  ; Replaced statement -- 
-  ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5
-  ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5
-  %this_node = call i8* @llvm.visc.getNode()
-  %call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node)
-  %call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node)
-  ; ---------------------- VISC changes End ------------------
-
-  ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5
-  ; With k == 0 the dot product is empty: jump straight to the store of 0.0.
-  %cmp44 = icmp eq i32 %k, 0
-  br i1 %cmp44, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %entry
-  ; Loop-invariant row offset into A: y * k.
-  %mul = mul i32 %call2, %k
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ]
-  %0 = trunc i64 %indvars.iv to i32
-  ; %add = y*k + i (index into A); %add5 = i*n + x (index into B).
-  %add = add i32 %0, %mul
-  %mul4 = mul i32 %0, %n
-  %add5 = add i32 %mul4, %call1
-  ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5
-  ; Both indices are computed in 32 bits and zero-extended before addressing.
-  %idxprom = zext i32 %add to i64
-  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
-  %1 = load float* %arrayidx, align 4, !tbaa !0
-  %idxprom11 = zext i32 %add5 to i64
-  %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11
-  %2 = load float* %arrayidx12, align 4, !tbaa !0
-  %mul13 = fmul float %1, %2
-  %add14 = fadd float %res.046, %mul13
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %k
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ]
-  ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0))
-  ; Store the accumulated value at C[y*n + x].
-  %mul16 = mul i32 %call2, %n
-  %add17 = add i32 %mul16, %call1
-  %idxprom18 = zext i32 %add17 to i64
-  %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18
-  store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0
-  ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0))
-  ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0))
-  ; %rtype is an empty struct, so there is no value to return.
-  ret %rtype undef 
-}
-
-; ----------------- VISC SGEMM root node ----------------
-; Root node passed to llvm.visc.launch by @main. It creates one 2D node for
-; @matrixMul, with %WB and %HA as the two dimension arguments, and issues one
-; llvm.visc.bind.input call per parameter, each with equal index arguments.
-; NOTE(review): the two i32 arguments of bind.input are presumably (argument
-; index of this node, argument index of the kernel) - confirm against the
-; intrinsic's definition.
-define %rtype @MatrixMulRoot(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA)
-  ; Bind Inputs
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
-  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
-  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
-  ; Bind Outputs: none are bound here (%rtype has no fields)
-  ret %rtype undef
-}
-
-; Function Attrs: noinline nounwind uwtable
-;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 {
-;entry:
-;  %cmp18 = icmp eq i32 %m, 0
-;  %cmp215 = icmp eq i32 %n, 0
-;  %or.cond = or i1 %cmp18, %cmp215
-;  br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.inc4.us:                                      ; preds = %for.body3.us
-;  %0 = extractvalue %rtype %call.us, 0
-;  %1 = extractvalue %rtype %call.us, 1
-;  %inc5.us = add i32 %i.019.us, 1
-;  %exitcond24 = icmp eq i32 %inc5.us, %m
-;  br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
-;  %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ]
-;  %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef)
-;  %inc.us = add i32 %j.016.us, 1
-;  %exitcond = icmp eq i32 %inc.us, %n
-;  br i1 %exitcond, label %for.inc4.us, label %for.body3.us
-;
-;for.body3.lr.ph.us:                               ; preds = %entry, %for.inc4.us
-;  %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ]
-;  br label %for.body3.us
-;
-;for.end6:                                         ; preds = %for.inc4.us, %entry
-;  %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ]
-;  %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ]
-;  %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0
-;  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1
-;  ret %rtype %.fca.1.insert
-;}
-
-; Test driver: allocates three 4194304-byte buffers, fills the first two with
-; scaled rand() values (1048576 floats each), launches the @MatrixMulRoot graph
-; through the VISC intrinsics, checks the third buffer with @checkResults and
-; prints the Pass/Failed/Done strings. Always returns 0.
-; NOTE(review): none of the three malloc results is checked for null.
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
-entry:
-  tail call void @srand(i32 2006) #5
-  ; %call backs matrix A (%0), %call7 backs matrix B (%1).
-  %call = tail call noalias i8* @malloc(i64 4194304) #5
-  %0 = bitcast i8* %call to float*
-  %call7 = tail call noalias i8* @malloc(i64 4194304) #5
-  br label %for.body.i
-
-; Same loop body as @randomInit, applied to A for 1048576 elements.
-for.body.i:                                       ; preds = %for.body.i, %entry
-  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ]
-  %call.i = tail call i32 @rand() #5
-  %conv.i = sitofp i32 %call.i to float
-  %div.i = fmul float %conv.i, 0x3E00000000000000
-  %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i
-  store float %div.i, float* %arrayidx.i, align 4, !tbaa !0
-  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
-  %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32
-  %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576
-  br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i
-
-for.body.i40.preheader:                           ; preds = %for.body.i
-  %1 = bitcast i8* %call7 to float*
-  br label %for.body.i40
-
-; Same loop body as @randomInit, applied to B for 1048576 elements.
-for.body.i40:                                     ; preds = %for.body.i40.preheader, %for.body.i40
-  %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ]
-  %call.i33 = tail call i32 @rand() #5
-  %conv.i34 = sitofp i32 %call.i33 to float
-  %div.i35 = fmul float %conv.i34, 0x3E00000000000000
-  %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32
-  store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0
-  %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1048576
-  br i1 %exitcond, label %randomInit.exit41, label %for.body.i40
-
-randomInit.exit41:                                ; preds = %for.body.i40
-  ; %call12 backs the result matrix C (%2).
-  %call12 = tail call noalias i8* @malloc(i64 4194304) #5
-  %2 = bitcast i8* %call12 to float*
-
-  ; ---------------------------------- Adding VISC Launch Call --------------------------------
-  ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024)
-  ; Setting up launch input args
-  call void @llvm.visc.init()
-  ; Stack record holding every launch argument plus the %rtype result slot.
-  %in.addr = alloca %struct.arg
-
-  ; Store arguments
-  ; Field addresses 0-8 of the packed record, in parameter order.
-  %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
-  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
-  %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
-  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
-  %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4
-  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5
-  %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6
-  %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7
-  %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
-
-  ; Buffer sizes are 4194304 bytes each; all three dimensions are 1024.
-  store float* %0, float** %in.addr.h_A
-  store i64 4194304, i64* %in.addr.bytes_A
-  store float* %1, float** %in.addr.h_B
-  store i64 4194304, i64* %in.addr.bytes_B
-  store float* %2, float** %in.addr.h_C
-  store i64 4194304, i64* %in.addr.bytes_C
-  store i32 1024, i32* %in.addr.WA
-  store i32 1024, i32* %in.addr.WB
-  store i32 1024, i32* %in.addr.HA
-
-  ; Change type to i8* and VISC Launch call
-  %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
-
-  ; Wait for result
-  call void @llvm.visc.wait(i8* %graphID)
-
-  ; Get the result
-  ; Field 9 is the %rtype slot; %rtype is empty, so %out carries no data and
-  ; the check below reads the C buffer (%2) directly.
-  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9
-  %out = load %rtype* %out.addr
-  call void @llvm.visc.cleanup()
-  ; -------------------------------- Completed VISC Launch Call --------------------------------
-
-  ; A zero return from @checkResults selects the "Failed!" branch.
-  %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2)
-  %tobool = icmp eq i32 %call14, 0
-  br i1 %tobool, label %if.else, label %if.then
-
-if.then:                                          ; preds = %randomInit.exit41
-  %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0))
-  br label %if.end
-
-if.else:                                          ; preds = %randomInit.exit41
-  %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0))
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0))
-  ; Release the three matrix buffers.
-  tail call void @free(i8* %call) #5
-  tail call void @free(i8* %call7) #5
-  tail call void @free(i8* %call12) #5
-  ret i32 0
-}
-
-; Function Attrs: nounwind
-declare void @srand(i32) #1
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #1
-
-declare float @fabsf(float)
-
-; Function Attrs: nounwind
-declare i32 @puts(i8* nocapture) #5
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
-attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!0 = metadata !{metadata !"float", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/hpvm/test/MatrixMultiplication/visc_gemm_2_level.ll b/hpvm/test/MatrixMultiplication/visc_gemm_2_level.ll
deleted file mode 100644
index ed3e3bf0985c24ac5785137be91e67ac298093b4..0000000000000000000000000000000000000000
--- a/hpvm/test/MatrixMultiplication/visc_gemm_2_level.ll
+++ /dev/null
@@ -1,458 +0,0 @@
-; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
-; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc
-; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s
-; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
-; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
-; RUN: %t.bin
-; ModuleID = 'gemm_opencl.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1
-@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1
-@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1
-@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
-@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1
-@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1
-@str = private unnamed_addr constant [17 x i8] c"Entered function\00"
-@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00"
-@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00"
-@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00"
-@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00"
-@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00"
-@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00"
-
-; Function Attrs: nounwind uwtable
-define void @randomInit(float* nocapture %data, i32 %size) #0 {
-entry:                                            ; Fills data[0 .. size-1] with scaled rand() values; stores nothing when size <= 0.
-  %cmp3 = icmp sgt i32 %size, 0
-  br i1 %cmp3, label %for.body, label %for.end
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %call = tail call i32 @rand() #5
-  %conv = sitofp i32 %call to float
-  %div = fmul float %conv, 0x3E00000000000000       ; the hex constant is 2^-31, i.e. rand() / 2147483648.0
-  %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv
-  store float %div, float* %arrayidx, align 4, !tbaa !0
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; narrow the 64-bit counter so it can be compared with the i32 %size
-  %exitcond = icmp eq i32 %lftr.wideiv, %size
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ret void
-}
-
-; Function Attrs: nounwind
-declare i32 @rand() #1
-
-; Function Attrs: nounwind readnone uwtable
-define i32 @isEqual(float %a, float %b) #2 {
-entry:                                            ; Returns 1 when |a - b| < 1e-3, otherwise 0.
-  %sub = fsub float %a, %b
-  %fabsf = tail call float @fabsf(float %sub) #6
-  %0 = fpext float %fabsf to double                 ; widen to double to match the type of the tolerance literal
-  %cmp = fcmp olt double %0, 1.000000e-03           ; ordered compare: yields false when the difference is NaN
-  %conv1 = zext i1 %cmp to i32
-  ret i32 %conv1
-}
-
-; Function Attrs: noinline nounwind uwtable
-define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 {
-entry:                                            ; For i, j in [0, 1024): recomputes sum_k A[i*1024+k] * B[k*1024+j] and compares it with C[i*1024+j]. Returns 1 if all match within 1e-3; prints the first mismatch and returns 0 otherwise.
-  br label %for.cond4.preheader
-
-for.cond4.preheader:                              ; preds = %entry, %for.inc50
-  %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ]
-  %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ]
-  %0 = shl nsw i64 %indvars.iv92, 10                ; i * 1024: offset of the current i in A and C
-  br label %for.body7
-
-for.cond4:                                        ; preds = %for.end
-  %inc48 = add nsw i32 %j.079, 1
-  %1 = trunc i64 %indvars.iv.next89 to i32
-  %cmp5 = icmp slt i32 %1, 1024
-  br i1 %cmp5, label %for.body7, label %for.inc50
-
-for.body7:                                        ; preds = %for.cond4.preheader, %for.cond4
-  %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ]
-  %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ]
-  %2 = add nsw i64 %indvars.iv88, %0                ; i * 1024 + j: index of the element of C being checked
-  br label %for.body12
-
-for.body12:                                       ; preds = %for.body12, %for.body7
-  %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ]
-  %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ]
-  %4 = add nsw i64 %indvars.iv, %0                  ; i * 1024 + k
-  %arrayidx16 = getelementptr inbounds float* %A, i64 %4
-  %5 = load float* %arrayidx16, align 4, !tbaa !0
-  %6 = shl i64 %indvars.iv, 10                      ; k * 1024
-  %7 = add nsw i64 %6, %indvars.iv88                ; k * 1024 + j
-  %arrayidx20 = getelementptr inbounds float* %B, i64 %7
-  %8 = load float* %arrayidx20, align 4, !tbaa !0
-  %mul21 = fmul float %5, %8
-  %add26 = fadd float %3, %mul21                    ; running dot product (the reference value)
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.end, label %for.body12
-
-for.end:                                          ; preds = %for.body12
-  %arrayidx34 = getelementptr inbounds float* %C, i64 %2
-  %9 = load float* %arrayidx34, align 4, !tbaa !0
-  %sub.i = fsub float %add26, %9                    ; same |reference - C| < 1e-3 test that @isEqual performs
-  %fabsf.i = tail call float @fabsf(float %sub.i) #6
-  %10 = fpext float %fabsf.i to double
-  %cmp.i = fcmp olt double %10, 1.000000e-03
-  %indvars.iv.next89 = add i64 %indvars.iv88, 1
-  br i1 %cmp.i, label %for.cond4, label %if.then
-
-if.then:                                          ; preds = %for.end
-  %conv40 = fpext float %9 to double                ; value read from C
-  %conv45 = fpext float %add26 to double            ; recomputed reference value
-  %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5
-  br label %return
-
-for.inc50:                                        ; preds = %for.cond4
-  %indvars.iv.next93 = add i64 %indvars.iv92, 1
-  %inc51 = add nsw i32 %i.081, 1
-  %11 = trunc i64 %indvars.iv.next93 to i32
-  %cmp = icmp slt i32 %11, 1024
-  br i1 %cmp, label %for.cond4.preheader, label %return
-
-return:                                           ; preds = %for.inc50, %if.then
-  %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] ; 0 = mismatch found, 1 = all elements matched
-  ret i32 %retval.0
-}
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #1
-
-; Function Attrs: nounwind
-declare i32 @printf(i8* nocapture, ...) #1
-
-; --------------- VISC Intrinsics ---------------
-; Return Type of VISC Compute Matrix Mul
-%rtype = type {}
-%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }>
-
-; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0
-
-; Function Attrs: none (this declaration carries no attribute group)
-declare void @llvm.visc.bind.input(i8*, i32, i32)
-
-; Function Attrs: none (this declaration carries no attribute group)
-declare void @llvm.visc.bind.output(i8*, i32, i32)
-; ----------------- VISC intrinsics end ------------------
-
-; Function Attrs: nounwind uwtable
-define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
-entry:                                            ; Leaf kernel: computes one element C[y*n + x] = sum over i in [0, k) of A[y*k + i] * B[i*n + x], where (x, y) is this instance's global ID. %m and the %bytes_* arguments are not read in this body.
-  ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0))
-  
-  ; ------------------------- VISC changes ------------------
-  ; Replace get_global_id calls: global ID = getNodeInstanceID(parent) * getNumNodeInstances(this) + getNodeInstanceID(this)
-  ; Replaced statement -- 
-  ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5
-  ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5
-  %this_node = call i8* @llvm.visc.getNode()
-  %Lx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node)
-  %Ly = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node)
-  %LLimitx = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node)
-  %LLimity = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node)
-
-  %parent_node = call i8* @llvm.visc.getParentNode(i8* %this_node)
-  %Gx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node)
-  %Gy = call i32 @llvm.visc.getNodeInstanceID.y(i8* %parent_node)
-
-  %tmpx = mul i32 %Gx, %LLimitx
-  %tmpy = mul i32 %Gy, %LLimity
-
-  %call1 = add i32 %tmpx, %Lx                       ; global x; stands in for get_global_id(0)
-  %call2 = add i32 %tmpy, %Ly                       ; global y; stands in for get_global_id(1)
-
-  ;%printcall1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call1) #5
-  ;%printcall2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call2) #5
-
-  ; ---------------------- VISC changes End ------------------
-
-  ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5
-  %cmp44 = icmp eq i32 %k, 0                        ; k == 0: skip the loop and store 0.0
-  br i1 %cmp44, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %entry
-  %mul = mul i32 %call2, %k                         ; y * k, hoisted out of the loop
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ]
-  %0 = trunc i64 %indvars.iv to i32
-  %add = add i32 %0, %mul                           ; index into A: y * k + i
-  %mul4 = mul i32 %0, %n
-  %add5 = add i32 %mul4, %call1                     ; index into B: i * n + x
-  ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5
-  %idxprom = zext i32 %add to i64                   ; indices are zero-extended, i.e. treated as unsigned
-  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
-  %1 = load float* %arrayidx, align 4, !tbaa !0
-  %idxprom11 = zext i32 %add5 to i64
-  %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11
-  %2 = load float* %arrayidx12, align 4, !tbaa !0
-  %mul13 = fmul float %1, %2
-  %add14 = fadd float %res.046, %mul13
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %k
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ]
-  ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0))
-  %mul16 = mul i32 %call2, %n
-  %add17 = add i32 %mul16, %call1                   ; index into C: y * n + x
-  %idxprom18 = zext i32 %add17 to i64
-  %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18
-  store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0
-  ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0))
-  ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0))
-  ret %rtype undef                                  ; %rtype is the empty struct; the result is delivered through the store to %C
-}
-
-; ----------------- VISC SGEMM internal node: wraps @matrixMul; instantiated by @MatrixMulRoot ----------------
-define %rtype @MatrixMulInternal(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 16, i32 16) ; child node for @matrixMul; 16, 16 are presumably the per-dimension instance counts -- TODO confirm
-  ; Bind Inputs: each call passes the same index twice (presumably parent argument i -> child argument i -- TODO confirm)
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
-  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
-  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
-  ; Bind Outputs: none are bound (%rtype is the empty struct)
-  ret %rtype undef
-}
-
-; ----------------- VISC SGEMM root node: the function @main passes to llvm.visc.launch ----------------
-define %rtype @MatrixMulRoot(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulInternal to i8*),i32 64, i32 64) ; child node for @MatrixMulInternal; 64, 64 are presumably the per-dimension instance counts (64 * 16 = 1024) -- TODO confirm
-  ; Bind Inputs: each call passes the same index twice (presumably parent argument i -> child argument i -- TODO confirm)
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
-  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
-  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
-  ; Bind Outputs: none are bound (%rtype is the empty struct)
-  ret %rtype undef
-}
-
-; Function Attrs: noinline nounwind uwtable
-;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 {
-;entry:
-;  %cmp18 = icmp eq i32 %m, 0
-;  %cmp215 = icmp eq i32 %n, 0
-;  %or.cond = or i1 %cmp18, %cmp215
-;  br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.inc4.us:                                      ; preds = %for.body3.us
-;  %0 = extractvalue %rtype %call.us, 0
-;  %1 = extractvalue %rtype %call.us, 1
-;  %inc5.us = add i32 %i.019.us, 1
-;  %exitcond24 = icmp eq i32 %inc5.us, %m
-;  br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
-;  %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ]
-;  %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef)
-;  %inc.us = add i32 %j.016.us, 1
-;  %exitcond = icmp eq i32 %inc.us, %n
-;  br i1 %exitcond, label %for.inc4.us, label %for.body3.us
-;
-;for.body3.lr.ph.us:                               ; preds = %entry, %for.inc4.us
-;  %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ]
-;  br label %for.body3.us
-;
-;for.end6:                                         ; preds = %for.inc4.us, %entry
-;  %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ]
-;  %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ]
-;  %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0
-;  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1
-;  ret %rtype %.fca.1.insert
-;}
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
-entry:                                            ; Test driver: fills two 1048576-float buffers with random data, launches @MatrixMulRoot through the VISC intrinsics, verifies the output with @checkResults, prints Pass!/Failed! then Done!, and always returns 0.
-  tail call void @srand(i32 2006) #5                ; fixed seed, so the inputs are the same on every run
-  %call = tail call noalias i8* @malloc(i64 4194304) #5 ; A: 4194304 bytes = 1048576 floats. NOTE(review): no malloc result in this function is null-checked
-  %0 = bitcast i8* %call to float*
-  %call7 = tail call noalias i8* @malloc(i64 4194304) #5 ; B: same size
-  br label %for.body.i
-
-for.body.i:                                       ; preds = %for.body.i, %entry
-  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ]
-  %call.i = tail call i32 @rand() #5
-  %conv.i = sitofp i32 %call.i to float
-  %div.i = fmul float %conv.i, 0x3E00000000000000   ; same body as @randomInit (scale by 2^-31), here filling A
-  %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i
-  store float %div.i, float* %arrayidx.i, align 4, !tbaa !0
-  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
-  %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32
-  %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576
-  br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i
-
-for.body.i40.preheader:                           ; preds = %for.body.i
-  %1 = bitcast i8* %call7 to float*
-  br label %for.body.i40
-
-for.body.i40:                                     ; preds = %for.body.i40.preheader, %for.body.i40
-  %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ]
-  %call.i33 = tail call i32 @rand() #5
-  %conv.i34 = sitofp i32 %call.i33 to float
-  %div.i35 = fmul float %conv.i34, 0x3E00000000000000 ; same fill, now for B
-  %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32
-  store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0
-  %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1048576
-  br i1 %exitcond, label %randomInit.exit41, label %for.body.i40
-
-randomInit.exit41:                                ; preds = %for.body.i40
-  %call12 = tail call noalias i8* @malloc(i64 4194304) #5 ; C: output buffer, not initialised before the launch
-  %2 = bitcast i8* %call12 to float*
-
-  ; ---------------------------------- Adding VISC Launch Call --------------------------------
-  ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024)
-  ; Setting up launch input args
-  call void @llvm.visc.init()
-  %in.addr = alloca %struct.arg                     ; packed struct: the nine arguments of @MatrixMulRoot followed by an %rtype slot
-
-  ; Store arguments (field order matches the parameter order of @MatrixMulRoot)
-  %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
-  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
-  %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
-  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
-  %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4
-  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5
-  %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6
-  %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7
-  %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
-
-  store float* %0, float** %in.addr.h_A
-  store i64 4194304, i64* %in.addr.bytes_A
-  store float* %1, float** %in.addr.h_B
-  store i64 4194304, i64* %in.addr.bytes_B
-  store float* %2, float** %in.addr.h_C
-  store i64 4194304, i64* %in.addr.bytes_C
-  store i32 1024, i32* %in.addr.WA
-  store i32 1024, i32* %in.addr.WB
-  store i32 1024, i32* %in.addr.HA
-
-  ; Change type to i8* and VISC Launch call
-  %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
-
-  ; Wait for result
-  call void @llvm.visc.wait(i8* %graphID)
-
-  ; Get the result (%rtype is the empty struct, so this load carries no data and %out is unused)
-  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9
-  %out = load %rtype* %out.addr
-  call void @llvm.visc.cleanup()
-  ; -------------------------------- Completed VISC Launch Call --------------------------------
-
-  %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2)
-  %tobool = icmp eq i32 %call14, 0                  ; true when checkResults reported a mismatch
-  br i1 %tobool, label %if.else, label %if.then
-
-if.then:                                          ; preds = %randomInit.exit41
-  %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0))
-  br label %if.end
-
-if.else:                                          ; preds = %randomInit.exit41
-  %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0))
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0))
-  tail call void @free(i8* %call) #5
-  tail call void @free(i8* %call7) #5
-  tail call void @free(i8* %call12) #5
-  ret i32 0                                         ; exit status is 0 even when verification failed
-}
-
-; Function Attrs: nounwind
-declare void @srand(i32) #1
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #1
-
-declare float @fabsf(float)
-
-; Function Attrs: nounwind
-declare i32 @puts(i8* nocapture) #5
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
-attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!0 = metadata !{metadata !"float", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_host.ll b/hpvm/test/MatrixMultiplication/visc_gemm_2_level_host.ll
deleted file mode 100644
index fc3db521db174e58626a5c4daf109061530bb250..0000000000000000000000000000000000000000
--- a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_host.ll
+++ /dev/null
@@ -1,456 +0,0 @@
-; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
-; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
-; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
-; RUN: %t.bin
-; ModuleID = 'gemm_opencl.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1
-@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1
-@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1
-@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
-@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1
-@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1
-@str = private unnamed_addr constant [17 x i8] c"Entered function\00"
-@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00"
-@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00"
-@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00"
-@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00"
-@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00"
-@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00"
-
-; Function Attrs: nounwind uwtable
-define void @randomInit(float* nocapture %data, i32 %size) #0 {
-entry:                                            ; Fills data[0 .. size-1] with scaled rand() values; stores nothing when size <= 0.
-  %cmp3 = icmp sgt i32 %size, 0
-  br i1 %cmp3, label %for.body, label %for.end
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %call = tail call i32 @rand() #5
-  %conv = sitofp i32 %call to float
-  %div = fmul float %conv, 0x3E00000000000000       ; the hex constant is 2^-31, i.e. rand() / 2147483648.0
-  %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv
-  store float %div, float* %arrayidx, align 4, !tbaa !0
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; narrow the 64-bit counter so it can be compared with the i32 %size
-  %exitcond = icmp eq i32 %lftr.wideiv, %size
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ret void
-}
-
-; Function Attrs: nounwind
-declare i32 @rand() #1
-
-; Function Attrs: nounwind readnone uwtable
-define i32 @isEqual(float %a, float %b) #2 {
-entry:                                            ; Returns 1 when |a - b| < 1e-3, otherwise 0.
-  %sub = fsub float %a, %b
-  %fabsf = tail call float @fabsf(float %sub) #6
-  %0 = fpext float %fabsf to double                 ; widen to double to match the type of the tolerance literal
-  %cmp = fcmp olt double %0, 1.000000e-03           ; ordered compare: yields false when the difference is NaN
-  %conv1 = zext i1 %cmp to i32
-  ret i32 %conv1
-}
-
-; Function Attrs: noinline nounwind uwtable
-define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 {
-entry:                                            ; For i, j in [0, 1024): recomputes sum_k A[i*1024+k] * B[k*1024+j] and compares it with C[i*1024+j]. Returns 1 if all match within 1e-3; prints the first mismatch and returns 0 otherwise.
-  br label %for.cond4.preheader
-
-for.cond4.preheader:                              ; preds = %entry, %for.inc50
-  %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ]
-  %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ]
-  %0 = shl nsw i64 %indvars.iv92, 10                ; i * 1024: offset of the current i in A and C
-  br label %for.body7
-
-for.cond4:                                        ; preds = %for.end
-  %inc48 = add nsw i32 %j.079, 1
-  %1 = trunc i64 %indvars.iv.next89 to i32
-  %cmp5 = icmp slt i32 %1, 1024
-  br i1 %cmp5, label %for.body7, label %for.inc50
-
-for.body7:                                        ; preds = %for.cond4.preheader, %for.cond4
-  %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ]
-  %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ]
-  %2 = add nsw i64 %indvars.iv88, %0                ; i * 1024 + j: index of the element of C being checked
-  br label %for.body12
-
-for.body12:                                       ; preds = %for.body12, %for.body7
-  %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ]
-  %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ]
-  %4 = add nsw i64 %indvars.iv, %0                  ; i * 1024 + k
-  %arrayidx16 = getelementptr inbounds float* %A, i64 %4
-  %5 = load float* %arrayidx16, align 4, !tbaa !0
-  %6 = shl i64 %indvars.iv, 10                      ; k * 1024
-  %7 = add nsw i64 %6, %indvars.iv88                ; k * 1024 + j
-  %arrayidx20 = getelementptr inbounds float* %B, i64 %7
-  %8 = load float* %arrayidx20, align 4, !tbaa !0
-  %mul21 = fmul float %5, %8
-  %add26 = fadd float %3, %mul21                    ; running dot product (the reference value)
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.end, label %for.body12
-
-for.end:                                          ; preds = %for.body12
-  %arrayidx34 = getelementptr inbounds float* %C, i64 %2
-  %9 = load float* %arrayidx34, align 4, !tbaa !0
-  %sub.i = fsub float %add26, %9                    ; same |reference - C| < 1e-3 test that @isEqual performs
-  %fabsf.i = tail call float @fabsf(float %sub.i) #6
-  %10 = fpext float %fabsf.i to double
-  %cmp.i = fcmp olt double %10, 1.000000e-03
-  %indvars.iv.next89 = add i64 %indvars.iv88, 1
-  br i1 %cmp.i, label %for.cond4, label %if.then
-
-if.then:                                          ; preds = %for.end
-  %conv40 = fpext float %9 to double                ; value read from C
-  %conv45 = fpext float %add26 to double            ; recomputed reference value
-  %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5
-  br label %return
-
-for.inc50:                                        ; preds = %for.cond4
-  %indvars.iv.next93 = add i64 %indvars.iv92, 1
-  %inc51 = add nsw i32 %i.081, 1
-  %11 = trunc i64 %indvars.iv.next93 to i32
-  %cmp = icmp slt i32 %11, 1024
-  br i1 %cmp, label %for.cond4.preheader, label %return
-
-return:                                           ; preds = %for.inc50, %if.then
-  %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] ; 0 = mismatch found, 1 = all elements matched
-  ret i32 %retval.0
-}
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #1
-
-; Function Attrs: nounwind
-declare i32 @printf(i8* nocapture, ...) #1
-
-; --------------- VISC Intrinsics ---------------
-; Return Type of VISC Compute Matrix Mul
-%rtype = type {}
-%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }>
-
-; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0
-
-; Function Attrs: none (this declaration carries no attribute group)
-declare void @llvm.visc.bind.input(i8*, i32, i32)
-
-; Function Attrs: none (this declaration carries no attribute group)
-declare void @llvm.visc.bind.output(i8*, i32, i32)
-; ----------------- VISC intrinsics end ------------------
-
-; Function Attrs: nounwind uwtable
-define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
-entry:
-  ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0))
-  
-  ; ------------------------- VISC changes ------------------
-  ; Replace get_global_id calls: global ID = getNodeInstanceID(parent) * getNumNodeInstances(this) + getNodeInstanceID(this)
-  ; Replaced statement -- 
-  ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5
-  ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5
-  %this_node = call i8* @llvm.visc.getNode()
-  %Lx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node)
-  %Ly = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node)
-  %LLimitx = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node)
-  %LLimity = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node)
-
-  %parent_node = call i8* @llvm.visc.getParentNode(i8* %this_node)
-  %Gx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node)
-  %Gy = call i32 @llvm.visc.getNodeInstanceID.y(i8* %parent_node)
-
-  %tmpx = mul i32 %Gx, %LLimitx
-  %tmpy = mul i32 %Gy, %LLimity
-
-  %call1 = add i32 %tmpx, %Lx
-  %call2 = add i32 %tmpy, %Ly
-
-  ;%printcall1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call1) #5
-  ;%printcall2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call2) #5
-
-  ; ---------------------- VISC changes End ------------------
-
-  ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5
-  %cmp44 = icmp eq i32 %k, 0
-  br i1 %cmp44, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %entry
-  %mul = mul i32 %call2, %k
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ]
-  %0 = trunc i64 %indvars.iv to i32
-  %add = add i32 %0, %mul
-  %mul4 = mul i32 %0, %n
-  %add5 = add i32 %mul4, %call1
-  ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5
-  %idxprom = zext i32 %add to i64
-  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
-  %1 = load float* %arrayidx, align 4, !tbaa !0
-  %idxprom11 = zext i32 %add5 to i64
-  %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11
-  %2 = load float* %arrayidx12, align 4, !tbaa !0
-  %mul13 = fmul float %1, %2
-  %add14 = fadd float %res.046, %mul13
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %k
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ]
-  ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0))
-  %mul16 = mul i32 %call2, %n
-  %add17 = add i32 %mul16, %call1
-  %idxprom18 = zext i32 %add17 to i64
-  %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18
-  store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0
-  ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0))
-  ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0))
-  ret %rtype undef
-}
-
-; ----------------- VISC SGEMM internal node (instantiated by @MatrixMulRoot) ----------------
-define %rtype @MatrixMulInternal(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 16, i32 16) ; 2D child node running @matrixMul, dimension arguments 16 and 16
-  ; Bind Inputs: every call below uses the same index for both i32 arguments (0..8)
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
-  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
-  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
-  ; Bind Outputs (none are bound in this function)
-  ret %rtype undef
-}
-
-; ----------------- VISC SGEMM root node (the function passed to llvm.visc.launch in @main) ----------------
-define %rtype @MatrixMulRoot(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulInternal to i8*),i32 64, i32 64) ; 2D child node running @MatrixMulInternal, dimension arguments 64 and 64
-  ; Bind Inputs: every call below uses the same index for both i32 arguments (0..8)
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
-  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
-  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
-  ; Bind Outputs (none are bound in this function)
-  ret %rtype undef
-}
-
-; Function Attrs: noinline nounwind uwtable
-;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 {
-;entry:
-;  %cmp18 = icmp eq i32 %m, 0
-;  %cmp215 = icmp eq i32 %n, 0
-;  %or.cond = or i1 %cmp18, %cmp215
-;  br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.inc4.us:                                      ; preds = %for.body3.us
-;  %0 = extractvalue %rtype %call.us, 0
-;  %1 = extractvalue %rtype %call.us, 1
-;  %inc5.us = add i32 %i.019.us, 1
-;  %exitcond24 = icmp eq i32 %inc5.us, %m
-;  br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
-;  %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ]
-;  %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef)
-;  %inc.us = add i32 %j.016.us, 1
-;  %exitcond = icmp eq i32 %inc.us, %n
-;  br i1 %exitcond, label %for.inc4.us, label %for.body3.us
-;
-;for.body3.lr.ph.us:                               ; preds = %entry, %for.inc4.us
-;  %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ]
-;  br label %for.body3.us
-;
-;for.end6:                                         ; preds = %for.inc4.us, %entry
-;  %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ]
-;  %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ]
-;  %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0
-;  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1
-;  ret %rtype %.fca.1.insert
-;}
-
-; Test driver: fills two 1048576-float buffers with scaled rand() values, launches @MatrixMulRoot, then verifies with @checkResults. Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
-entry:
-  tail call void @srand(i32 2006) #5
-  %call = tail call noalias i8* @malloc(i64 4194304) #5 ; 4194304 bytes = 1048576 floats; result is not null-checked
-  %0 = bitcast i8* %call to float*
-  %call7 = tail call noalias i8* @malloc(i64 4194304) #5 ; second input buffer, same size; not null-checked
-  br label %for.body.i
-
-for.body.i:                                       ; preds = %for.body.i, %entry
-  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ]
-  %call.i = tail call i32 @rand() #5
-  %conv.i = sitofp i32 %call.i to float
-  %div.i = fmul float %conv.i, 0x3E00000000000000 ; constant is 2^-31
-  %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i
-  store float %div.i, float* %arrayidx.i, align 4, !tbaa !0
-  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
-  %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32
-  %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576
-  br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i
-
-for.body.i40.preheader:                           ; preds = %for.body.i
-  %1 = bitcast i8* %call7 to float*
-  br label %for.body.i40
-
-for.body.i40:                                     ; preds = %for.body.i40.preheader, %for.body.i40
-  %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ]
-  %call.i33 = tail call i32 @rand() #5
-  %conv.i34 = sitofp i32 %call.i33 to float
-  %div.i35 = fmul float %conv.i34, 0x3E00000000000000 ; constant is 2^-31
-  %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32
-  store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0
-  %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1048576
-  br i1 %exitcond, label %randomInit.exit41, label %for.body.i40
-
-randomInit.exit41:                                ; preds = %for.body.i40
-  %call12 = tail call noalias i8* @malloc(i64 4194304) #5 ; output buffer, same size; not null-checked
-  %2 = bitcast i8* %call12 to float*
-
-  ; ---------------------------------- Adding VISC Launch Call --------------------------------
-  ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024)
-  ; Setting up launch input args
-  call void @llvm.visc.init()
-  %in.addr = alloca %struct.arg ; argument block: fields 0..8 are the inputs stored below, field 9 is read back as %rtype
-
-  ; Store arguments
-  %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
-  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
-  %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
-  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
-  %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4
-  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5
-  %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6
-  %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7
-  %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
-
-  store float* %0, float** %in.addr.h_A
-  store i64 4194304, i64* %in.addr.bytes_A
-  store float* %1, float** %in.addr.h_B
-  store i64 4194304, i64* %in.addr.bytes_B
-  store float* %2, float** %in.addr.h_C
-  store i64 4194304, i64* %in.addr.bytes_C
-  store i32 1024, i32* %in.addr.WA
-  store i32 1024, i32* %in.addr.WB
-  store i32 1024, i32* %in.addr.HA
-
-  ; Change type to i8* and VISC Launch call
-  %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
-
-  ; Wait for result
-  call void @llvm.visc.wait(i8* %graphID)
-
-  ; Get the result
-  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9
-  %out = load %rtype* %out.addr ; loaded but not used anywhere below in this function
-  call void @llvm.visc.cleanup()
-  ; -------------------------------- Completed VISC Launch Call --------------------------------
-
-  %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2)
-  %tobool = icmp eq i32 %call14, 0 ; zero from checkResults -> if.else, nonzero -> if.then
-  br i1 %tobool, label %if.else, label %if.then
-
-if.then:                                          ; preds = %randomInit.exit41
-  %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0))
-  br label %if.end
-
-if.else:                                          ; preds = %randomInit.exit41
-  %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0))
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0))
-  tail call void @free(i8* %call) #5
-  tail call void @free(i8* %call7) #5
-  tail call void @free(i8* %call12) #5
-  ret i32 0 ; exit status is 0 on both the if.then and if.else paths
-}
-
-; Function Attrs: nounwind
-declare void @srand(i32) #1
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #1
-
-declare float @fabsf(float)
-
-; Function Attrs: nounwind
-declare i32 @puts(i8* nocapture) #5
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
-attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!0 = metadata !{metadata !"float", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_outedge.ll b/hpvm/test/MatrixMultiplication/visc_gemm_2_level_outedge.ll
deleted file mode 100644
index b51727e2a674de58cb83cbba2baf024b7aaebfab..0000000000000000000000000000000000000000
--- a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_outedge.ll
+++ /dev/null
@@ -1,472 +0,0 @@
-; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
-; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc
-; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s
-; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
-; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
-; RUN: %t.bin
-; ModuleID = 'gemm_opencl.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1
-@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1
-@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1
-@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
-@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1
-@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1
-@str = private unnamed_addr constant [17 x i8] c"Entered function\00"
-@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00"
-@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00"
-@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00"
-@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00"
-@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00"
-@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00"
-
-; Fills data[0 .. size-1] with rand() scaled by 2^-31; does nothing when size <= 0. Function Attrs: nounwind uwtable
-define void @randomInit(float* nocapture %data, i32 %size) #0 {
-entry:
-  %cmp3 = icmp sgt i32 %size, 0 ; guard: only enter the loop for a positive size
-  br i1 %cmp3, label %for.body, label %for.end
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %call = tail call i32 @rand() #5
-  %conv = sitofp i32 %call to float
-  %div = fmul float %conv, 0x3E00000000000000 ; constant is 2^-31
-  %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv
-  store float %div, float* %arrayidx, align 4, !tbaa !0
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %size ; leave once the i32-truncated counter equals size
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ret void
-}
-
-; Function Attrs: nounwind
-declare i32 @rand() #1
-
-; Returns 1 when |a - b| < 1.0e-03 (ordered compare, so a NaN difference yields 0), otherwise 0. Function Attrs: nounwind readnone uwtable
-define i32 @isEqual(float %a, float %b) #2 {
-entry:
-  %sub = fsub float %a, %b
-  %fabsf = tail call float @fabsf(float %sub) #6
-  %0 = fpext float %fabsf to double ; the comparison is done in double precision
-  %cmp = fcmp olt double %0, 1.000000e-03
-  %conv1 = zext i1 %cmp to i32
-  ret i32 %conv1
-}
-
-; Recomputes each C[i*1024 + j] from A and B with fixed 1024 bounds; returns 1 if all entries are within 1.0e-03, else prints the first mismatch and returns 0. Function Attrs: noinline nounwind uwtable
-define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 {
-entry:
-  br label %for.cond4.preheader
-
-for.cond4.preheader:                              ; preds = %entry, %for.inc50
-  %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ]
-  %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ]
-  %0 = shl nsw i64 %indvars.iv92, 10 ; i * 1024
-  br label %for.body7
-
-for.cond4:                                        ; preds = %for.end
-  %inc48 = add nsw i32 %j.079, 1
-  %1 = trunc i64 %indvars.iv.next89 to i32
-  %cmp5 = icmp slt i32 %1, 1024
-  br i1 %cmp5, label %for.body7, label %for.inc50
-
-for.body7:                                        ; preds = %for.cond4.preheader, %for.cond4
-  %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ]
-  %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ]
-  %2 = add nsw i64 %indvars.iv88, %0 ; C index = i*1024 + j
-  br label %for.body12
-
-for.body12:                                       ; preds = %for.body12, %for.body7
-  %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ]
-  %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ]
-  %4 = add nsw i64 %indvars.iv, %0 ; A index = i*1024 + k
-  %arrayidx16 = getelementptr inbounds float* %A, i64 %4
-  %5 = load float* %arrayidx16, align 4, !tbaa !0
-  %6 = shl i64 %indvars.iv, 10
-  %7 = add nsw i64 %6, %indvars.iv88 ; B index = k*1024 + j
-  %arrayidx20 = getelementptr inbounds float* %B, i64 %7
-  %8 = load float* %arrayidx20, align 4, !tbaa !0
-  %mul21 = fmul float %5, %8
-  %add26 = fadd float %3, %mul21 ; running reference sum for this (i, j)
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.end, label %for.body12
-
-for.end:                                          ; preds = %for.body12
-  %arrayidx34 = getelementptr inbounds float* %C, i64 %2
-  %9 = load float* %arrayidx34, align 4, !tbaa !0
-  %sub.i = fsub float %add26, %9
-  %fabsf.i = tail call float @fabsf(float %sub.i) #6
-  %10 = fpext float %fabsf.i to double
-  %cmp.i = fcmp olt double %10, 1.000000e-03 ; same tolerance test as @isEqual
-  %indvars.iv.next89 = add i64 %indvars.iv88, 1
-  br i1 %cmp.i, label %for.cond4, label %if.then
-
-if.then:                                          ; preds = %for.end
-  %conv40 = fpext float %9 to double
-  %conv45 = fpext float %add26 to double
-  %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5
-  br label %return
-
-for.inc50:                                        ; preds = %for.cond4
-  %indvars.iv.next93 = add i64 %indvars.iv92, 1
-  %inc51 = add nsw i32 %i.081, 1
-  %11 = trunc i64 %indvars.iv.next93 to i32
-  %cmp = icmp slt i32 %11, 1024
-  br i1 %cmp, label %for.cond4.preheader, label %return
-
-return:                                           ; preds = %for.inc50, %if.then
-  %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ] ; 0 = mismatch found, 1 = every element matched
-  ret i32 %retval.0
-}
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #1
-
-; Function Attrs: nounwind
-declare i32 @printf(i8* nocapture, ...) #1
-
-; --------------- VISC Intrinsics ---------------
-; Return Type of VISC Compute Matrix Mul
-%rtype = type <{ i32, i32 }>
-
-%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }>
-
-; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
-
-; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
-; ----------------- VISC intrinsics end ------------------
-
-; Leaf node: every instance computes one element of C as a k-term dot product and returns the constant pair {0, 1}; %bytes_A/%bytes_B/%bytes_C and %m are not read in the body. Function Attrs: nounwind uwtable
-define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
-entry:
-  ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0))
-
-  ; ------------------------- VISC changes ------------------
-  ; Replace get_global_id calls: per dimension, ID = parent's getNodeInstanceID * this node's getNumNodeInstances + this node's getNodeInstanceID
-  ; Replaced statement --
-  ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5
-  ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5
-  %this_node = call i8* @llvm.visc.getNode()
-  %Lx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node) ; this node's instance ID along x
-  %Ly = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node) ; this node's instance ID along y
-  %LLimitx = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node) ; instance count of this node along x
-  %LLimity = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node) ; instance count of this node along y
-
-  %parent_node = call i8* @llvm.visc.getParentNode(i8* %this_node)
-  %Gx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node) ; parent's instance ID along x
-  %Gy = call i32 @llvm.visc.getNodeInstanceID.y(i8* %parent_node) ; parent's instance ID along y
-
-  %tmpx = mul i32 %Gx, %LLimitx
-  %tmpy = mul i32 %Gy, %LLimity
-
-  %call1 = add i32 %tmpx, %Lx ; stands in for get_global_id(0)
-  %call2 = add i32 %tmpy, %Ly ; stands in for get_global_id(1)
-
-  ;%printcall1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call1) #5
-  ;%printcall2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call2) #5
-
-  ; ---------------------- VISC changes End ------------------
-
-  ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5
-  %cmp44 = icmp eq i32 %k, 0 ; k == 0: skip the loop and store 0.0
-  br i1 %cmp44, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %entry
-  %mul = mul i32 %call2, %k ; loop-invariant part of the A index: call2 * k
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ]
-  %0 = trunc i64 %indvars.iv to i32
-  %add = add i32 %0, %mul ; A index = call2*k + i
-  %mul4 = mul i32 %0, %n
-  %add5 = add i32 %mul4, %call1 ; B index = i*n + call1
-  ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5
-  %idxprom = zext i32 %add to i64
-  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
-  %1 = load float* %arrayidx, align 4, !tbaa !0
-  %idxprom11 = zext i32 %add5 to i64
-  %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11
-  %2 = load float* %arrayidx12, align 4, !tbaa !0
-  %mul13 = fmul float %1, %2
-  %add14 = fadd float %res.046, %mul13 ; running sum of A[..] * B[..]
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %k ; leave once the i32-truncated counter equals k
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ]
-  ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0))
-  %mul16 = mul i32 %call2, %n
-  %add17 = add i32 %mul16, %call1 ; C index = call2*n + call1
-  %idxprom18 = zext i32 %add17 to i64
-  %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18
-  store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0
-  ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0))
-  ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0))
-
-  ; Generate a fixed, recognizable output pair (field 0 = 0, field 1 = 1) so the output mapping can be checked
-  %tmp1 = insertvalue %rtype undef, i32 0, 0
-  %tmp2 = insertvalue %rtype %tmp1, i32 1, 1
-  ret %rtype %tmp2
-}
-
-; ----------------- VISC SGEMM internal node (instantiated by @MatrixMulRoot) ----------------
-define %rtype @MatrixMulInternal(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 16, i32 16) ; 2D child node running @matrixMul, dimension arguments 16 and 16
-  ; Bind Inputs: every call below uses the same index for both i32 arguments (0..8)
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
-  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
-  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
-  ; Bind Outputs: indices are deliberately crossed (0,1) and (1,0), unlike the identity bindings in @MatrixMulRoot
-  call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 1); 1 -- NOTE(review): presumably child output 0 feeds this node's output 1; confirm argument order
-  call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 0); 0 -- NOTE(review): presumably child output 1 feeds this node's output 0; confirm argument order
-  ret %rtype undef
-}
-
-; ----------------- VISC SGEMM root node (the function passed to llvm.visc.launch in @main) ----------------
-define %rtype @MatrixMulRoot(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulInternal to i8*),i32 64, i32 64) ; 2D child node running @MatrixMulInternal, dimension arguments 64 and 64
-  ; Bind Inputs: every call below uses the same index for both i32 arguments (0..8)
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
-  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
-  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
-  ; Bind Outputs: identity bindings, same index for both i32 arguments
-  call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0); 0
-  call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 1); 1
-  ret %rtype undef
-}
-
-; Function Attrs: noinline nounwind uwtable
-;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 {
-;entry:
-;  %cmp18 = icmp eq i32 %m, 0
-;  %cmp215 = icmp eq i32 %n, 0
-;  %or.cond = or i1 %cmp18, %cmp215
-;  br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.inc4.us:                                      ; preds = %for.body3.us
-;  %0 = extractvalue %rtype %call.us, 0
-;  %1 = extractvalue %rtype %call.us, 1
-;  %inc5.us = add i32 %i.019.us, 1
-;  %exitcond24 = icmp eq i32 %inc5.us, %m
-;  br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
-;  %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ]
-;  %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef)
-;  %inc.us = add i32 %j.016.us, 1
-;  %exitcond = icmp eq i32 %inc.us, %n
-;  br i1 %exitcond, label %for.inc4.us, label %for.body3.us
-;
-;for.body3.lr.ph.us:                               ; preds = %entry, %for.inc4.us
-;  %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ]
-;  br label %for.body3.us
-;
-;for.end6:                                         ; preds = %for.inc4.us, %entry
-;  %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ]
-;  %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ]
-;  %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0
-;  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1
-;  ret %rtype %.fca.1.insert
-;}
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
-entry:
-  tail call void @srand(i32 2006) #5
-  %call = tail call noalias i8* @malloc(i64 4194304) #5
-  %0 = bitcast i8* %call to float*
-  %call7 = tail call noalias i8* @malloc(i64 4194304) #5
-  br label %for.body.i
-
-for.body.i:                                       ; preds = %for.body.i, %entry
-  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ]
-  %call.i = tail call i32 @rand() #5
-  %conv.i = sitofp i32 %call.i to float
-  %div.i = fmul float %conv.i, 0x3E00000000000000
-  %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i
-  store float %div.i, float* %arrayidx.i, align 4, !tbaa !0
-  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
-  %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32
-  %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576
-  br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i
-
-for.body.i40.preheader:                           ; preds = %for.body.i
-  %1 = bitcast i8* %call7 to float*
-  br label %for.body.i40
-
-for.body.i40:                                     ; preds = %for.body.i40.preheader, %for.body.i40
-  %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ]
-  %call.i33 = tail call i32 @rand() #5
-  %conv.i34 = sitofp i32 %call.i33 to float
-  %div.i35 = fmul float %conv.i34, 0x3E00000000000000
-  %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32
-  store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0
-  %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1048576
-  br i1 %exitcond, label %randomInit.exit41, label %for.body.i40
-
-randomInit.exit41:                                ; preds = %for.body.i40
-  %call12 = tail call noalias i8* @malloc(i64 4194304) #5
-  %2 = bitcast i8* %call12 to float*
-
-  ; ---------------------------------- Adding VISC Launch Call --------------------------------
-  ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024)
-  ; Setting up launch input args
-  call void @llvm.visc.init()
-  %in.addr = alloca %struct.arg
-
-  ; Store arguments
-  %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
-  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
-  %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
-  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
-  %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4
-  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5
-  %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6
-  %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7
-  %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
-
-  store float* %0, float** %in.addr.h_A
-  store i64 4194304, i64* %in.addr.bytes_A
-  store float* %1, float** %in.addr.h_B
-  store i64 4194304, i64* %in.addr.bytes_B
-  store float* %2, float** %in.addr.h_C
-  store i64 4194304, i64* %in.addr.bytes_C
-  store i32 1024, i32* %in.addr.WA
-  store i32 1024, i32* %in.addr.WB
-  store i32 1024, i32* %in.addr.HA
-
-  ; Change type to i8* and VISC Launch call
-  %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
-
-  ; Wait for result
-  call void @llvm.visc.wait(i8* %graphID)
-
-  ; Get the result
-  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9
-  %out = load %rtype* %out.addr
-  %output_0 = extractvalue %rtype %out, 0
-  %output_1 = extractvalue %rtype %out, 1
-  
-  %printcall0 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %output_0) #5
-  %printcall1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %output_1) #5
-  call void @llvm.visc.cleanup()
-  ; -------------------------------- Completed VISC Launch Call --------------------------------
-
-  %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2)
-  %tobool = icmp eq i32 %call14, 0
-  br i1 %tobool, label %if.else, label %if.then
-
-if.then:                                          ; preds = %randomInit.exit41
-  %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0))
-  br label %if.end
-
-if.else:                                          ; preds = %randomInit.exit41
-  %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0))
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0))
-  tail call void @free(i8* %call) #5
-  tail call void @free(i8* %call7) #5
-  tail call void @free(i8* %call12) #5
-  ret i32 0
-}
-
-; Function Attrs: nounwind
-declare void @srand(i32) #1
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #1
-
-declare float @fabsf(float)
-
-; Function Attrs: nounwind
-declare i32 @puts(i8* nocapture) #5
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
-attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!0 = metadata !{metadata !"float", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_param.ll b/hpvm/test/MatrixMultiplication/visc_gemm_2_level_param.ll
deleted file mode 100644
index 30ccd8cc4c33e9287a0673c9ce537e6ccfb24b21..0000000000000000000000000000000000000000
--- a/hpvm/test/MatrixMultiplication/visc_gemm_2_level_param.ll
+++ /dev/null
@@ -1,463 +0,0 @@
-; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
-; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc
-; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s
-; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
-; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
-; RUN: %t.bin
-; ModuleID = 'gemm_opencl.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1
-@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1
-@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1
-@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
-@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1
-@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1
-@str = private unnamed_addr constant [17 x i8] c"Entered function\00"
-@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00"
-@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00"
-@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00"
-@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00"
-@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00"
-@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00"
-
-; Function Attrs: nounwind uwtable
-define void @randomInit(float* nocapture %data, i32 %size) #0 {
-entry:
-  %cmp3 = icmp sgt i32 %size, 0
-  br i1 %cmp3, label %for.body, label %for.end
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %call = tail call i32 @rand() #5
-  %conv = sitofp i32 %call to float
-  %div = fmul float %conv, 0x3E00000000000000
-  %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv
-  store float %div, float* %arrayidx, align 4, !tbaa !0
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %size
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ret void
-}
-
-; Function Attrs: nounwind
-declare i32 @rand() #1
-
-; Function Attrs: nounwind readnone uwtable
-define i32 @isEqual(float %a, float %b) #2 {
-entry:
-  %sub = fsub float %a, %b
-  %fabsf = tail call float @fabsf(float %sub) #6
-  %0 = fpext float %fabsf to double
-  %cmp = fcmp olt double %0, 1.000000e-03
-  %conv1 = zext i1 %cmp to i32
-  ret i32 %conv1
-}
-
-; Function Attrs: noinline nounwind uwtable
-define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 {
-entry:
-  br label %for.cond4.preheader
-
-for.cond4.preheader:                              ; preds = %entry, %for.inc50
-  %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ]
-  %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ]
-  %0 = shl nsw i64 %indvars.iv92, 10
-  br label %for.body7
-
-for.cond4:                                        ; preds = %for.end
-  %inc48 = add nsw i32 %j.079, 1
-  %1 = trunc i64 %indvars.iv.next89 to i32
-  %cmp5 = icmp slt i32 %1, 1024
-  br i1 %cmp5, label %for.body7, label %for.inc50
-
-for.body7:                                        ; preds = %for.cond4.preheader, %for.cond4
-  %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ]
-  %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ]
-  %2 = add nsw i64 %indvars.iv88, %0
-  br label %for.body12
-
-for.body12:                                       ; preds = %for.body12, %for.body7
-  %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ]
-  %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ]
-  %4 = add nsw i64 %indvars.iv, %0
-  %arrayidx16 = getelementptr inbounds float* %A, i64 %4
-  %5 = load float* %arrayidx16, align 4, !tbaa !0
-  %6 = shl i64 %indvars.iv, 10
-  %7 = add nsw i64 %6, %indvars.iv88
-  %arrayidx20 = getelementptr inbounds float* %B, i64 %7
-  %8 = load float* %arrayidx20, align 4, !tbaa !0
-  %mul21 = fmul float %5, %8
-  %add26 = fadd float %3, %mul21
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.end, label %for.body12
-
-for.end:                                          ; preds = %for.body12
-  %arrayidx34 = getelementptr inbounds float* %C, i64 %2
-  %9 = load float* %arrayidx34, align 4, !tbaa !0
-  %sub.i = fsub float %add26, %9
-  %fabsf.i = tail call float @fabsf(float %sub.i) #6
-  %10 = fpext float %fabsf.i to double
-  %cmp.i = fcmp olt double %10, 1.000000e-03
-  %indvars.iv.next89 = add i64 %indvars.iv88, 1
-  br i1 %cmp.i, label %for.cond4, label %if.then
-
-if.then:                                          ; preds = %for.end
-  %conv40 = fpext float %9 to double
-  %conv45 = fpext float %add26 to double
-  %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5
-  br label %return
-
-for.inc50:                                        ; preds = %for.cond4
-  %indvars.iv.next93 = add i64 %indvars.iv92, 1
-  %inc51 = add nsw i32 %i.081, 1
-  %11 = trunc i64 %indvars.iv.next93 to i32
-  %cmp = icmp slt i32 %11, 1024
-  br i1 %cmp, label %for.cond4.preheader, label %return
-
-return:                                           ; preds = %for.inc50, %if.then
-  %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ]
-  ret i32 %retval.0
-}
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #1
-
-; Function Attrs: nounwind
-declare i32 @printf(i8* nocapture, ...) #1
-
-; --------------- VISC Intrinsics ---------------
-; Return Type of VISC Compute Matrix Mul
-%rtype = type {}
-%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, i32, i32, %rtype }>
-
-; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
-
-; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
-; ----------------- VISC intrinsics end ------------------
-
-; Function Attrs: nounwind uwtable
-define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
-entry:
-  ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0))
-  
-  ; ------------------------- VISC changes ------------------
-  ; Replace get_global_id calls with calls to getNode followed by getNumNodeInstances.x
-  ; Replaced statement -- 
-  ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5
-  ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5
-  %this_node = call i8* @llvm.visc.getNode()
-  %Lx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node)
-  %Ly = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node)
-  %LLimitx = call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node)
-  %LLimity = call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node)
-
-  %parent_node = call i8* @llvm.visc.getParentNode(i8* %this_node)
-  %Gx = call i32 @llvm.visc.getNodeInstanceID.x(i8* %parent_node)
-  %Gy = call i32 @llvm.visc.getNodeInstanceID.y(i8* %parent_node)
-
-  %tmpx = mul i32 %Gx, %LLimitx
-  %tmpy = mul i32 %Gy, %LLimity
-
-  %call1 = add i32 %tmpx, %Lx
-  %call2 = add i32 %tmpy, %Ly
-
-  ;%printcall1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call1) #5
-  ;%printcall2 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @custom_str, i64 0, i64 0), i32 %call2) #5
-
-  ; ---------------------- VISC changes End ------------------
-
-  ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5
-  %cmp44 = icmp eq i32 %k, 0
-  br i1 %cmp44, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %entry
-  %mul = mul i32 %call2, %k
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ]
-  %0 = trunc i64 %indvars.iv to i32
-  %add = add i32 %0, %mul
-  %mul4 = mul i32 %0, %n
-  %add5 = add i32 %mul4, %call1
-  ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5
-  %idxprom = zext i32 %add to i64
-  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
-  %1 = load float* %arrayidx, align 4, !tbaa !0
-  %idxprom11 = zext i32 %add5 to i64
-  %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11
-  %2 = load float* %arrayidx12, align 4, !tbaa !0
-  %mul13 = fmul float %1, %2
-  %add14 = fadd float %res.046, %mul13
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %k
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ]
-  ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0))
-  %mul16 = mul i32 %call2, %n
-  %add17 = add i32 %mul16, %call1
-  %idxprom18 = zext i32 %add17 to i64
-  %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18
-  store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0
-  ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0))
-  ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0))
-  ret %rtype undef
-}
-
-; ----------------- VISC SGEMM root node ----------------
-define %rtype @MatrixMulInternal(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA, i32 %blocksize) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 %blocksize, i32 %blocksize)
-  ; Bind Inputs
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
-  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
-  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
-  ; Bind Outputs
-  ret %rtype undef
-}
-
-; ----------------- VISC SGEMM root node ----------------
-define %rtype @MatrixMulRoot(float* in %h_A, i64 %bytes_A, float* in %h_B, i64 %bytes_B, float* out %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA, i32 %gridsize, i32 %blocksize) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32)* @MatrixMulInternal to i8*),i32 %gridsize, i32 %gridsize)
-  ; Bind Inputs
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
-  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
-  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
-  call void @llvm.visc.bind.input(i8* %kernel, i32 10, i32 9); blocksize
-  ; Bind Outputs
-  ret %rtype undef
-}
-
-; Function Attrs: noinline nounwind uwtable
-;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 {
-;entry:
-;  %cmp18 = icmp eq i32 %m, 0
-;  %cmp215 = icmp eq i32 %n, 0
-;  %or.cond = or i1 %cmp18, %cmp215
-;  br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.inc4.us:                                      ; preds = %for.body3.us
-;  %0 = extractvalue %rtype %call.us, 0
-;  %1 = extractvalue %rtype %call.us, 1
-;  %inc5.us = add i32 %i.019.us, 1
-;  %exitcond24 = icmp eq i32 %inc5.us, %m
-;  br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
-;  %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ]
-;  %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef)
-;  %inc.us = add i32 %j.016.us, 1
-;  %exitcond = icmp eq i32 %inc.us, %n
-;  br i1 %exitcond, label %for.inc4.us, label %for.body3.us
-;
-;for.body3.lr.ph.us:                               ; preds = %entry, %for.inc4.us
-;  %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ]
-;  br label %for.body3.us
-;
-;for.end6:                                         ; preds = %for.inc4.us, %entry
-;  %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ]
-;  %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ]
-;  %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0
-;  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1
-;  ret %rtype %.fca.1.insert
-;}
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
-entry:
-  tail call void @srand(i32 2006) #5
-  %call = tail call noalias i8* @malloc(i64 4194304) #5
-  %0 = bitcast i8* %call to float*
-  %call7 = tail call noalias i8* @malloc(i64 4194304) #5
-  br label %for.body.i
-
-for.body.i:                                       ; preds = %for.body.i, %entry
-  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ]
-  %call.i = tail call i32 @rand() #5
-  %conv.i = sitofp i32 %call.i to float
-  %div.i = fmul float %conv.i, 0x3E00000000000000
-  %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i
-  store float %div.i, float* %arrayidx.i, align 4, !tbaa !0
-  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
-  %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32
-  %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576
-  br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i
-
-for.body.i40.preheader:                           ; preds = %for.body.i
-  %1 = bitcast i8* %call7 to float*
-  br label %for.body.i40
-
-for.body.i40:                                     ; preds = %for.body.i40.preheader, %for.body.i40
-  %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ]
-  %call.i33 = tail call i32 @rand() #5
-  %conv.i34 = sitofp i32 %call.i33 to float
-  %div.i35 = fmul float %conv.i34, 0x3E00000000000000
-  %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32
-  store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0
-  %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1048576
-  br i1 %exitcond, label %randomInit.exit41, label %for.body.i40
-
-randomInit.exit41:                                ; preds = %for.body.i40
-  %call12 = tail call noalias i8* @malloc(i64 4194304) #5
-  %2 = bitcast i8* %call12 to float*
-
-  ; ---------------------------------- Adding VISC Launch Call --------------------------------
-  ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024)
-  ; Setting up launch input args
-  call void @llvm.visc.init()
-  %in.addr = alloca %struct.arg
-
-  ; Store arguments
-  %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
-  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
-  %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
-  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
-  %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4
-  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5
-  %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6
-  %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7
-  %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
-  %in.addr.gridsize = getelementptr %struct.arg* %in.addr, i32 0, i32 9
-  %in.addr.blocksize = getelementptr %struct.arg* %in.addr, i32 0, i32 10
-
-  store float* %0, float** %in.addr.h_A
-  store i64 4194304, i64* %in.addr.bytes_A
-  store float* %1, float** %in.addr.h_B
-  store i64 4194304, i64* %in.addr.bytes_B
-  store float* %2, float** %in.addr.h_C
-  store i64 4194304, i64* %in.addr.bytes_C
-  store i32 1024, i32* %in.addr.WA
-  store i32 1024, i32* %in.addr.WB
-  store i32 1024, i32* %in.addr.HA
-  store i32 64, i32* %in.addr.gridsize
-  store i32 16, i32* %in.addr.blocksize
-
-  ; Change type to i8* and VISC Launch call
-  %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
-
-  ; Wait for result
-  call void @llvm.visc.wait(i8* %graphID)
-
-  ; Get the result
-  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 11
-  %out = load %rtype* %out.addr
-  call void @llvm.visc.cleanup()
-  ; -------------------------------- Completed VISC Launch Call --------------------------------
-
-  %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2)
-  %tobool = icmp eq i32 %call14, 0
-  br i1 %tobool, label %if.else, label %if.then
-
-if.then:                                          ; preds = %randomInit.exit41
-  %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0))
-  br label %if.end
-
-if.else:                                          ; preds = %randomInit.exit41
-  %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0))
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0))
-  tail call void @free(i8* %call) #5
-  tail call void @free(i8* %call7) #5
-  tail call void @free(i8* %call12) #5
-  ret i32 0
-}
-
-; Function Attrs: nounwind
-declare void @srand(i32) #1
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #1
-
-declare float @fabsf(float)
-
-; Function Attrs: nounwind
-declare i32 @puts(i8* nocapture) #5
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
-attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!0 = metadata !{metadata !"float", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/hpvm/test/MatrixMultiplication/visc_gemm_ptx.ll b/hpvm/test/MatrixMultiplication/visc_gemm_ptx.ll
deleted file mode 100644
index 0c3bc24f9dc5575783e3002115cc8976dfb3325a..0000000000000000000000000000000000000000
--- a/hpvm/test/MatrixMultiplication/visc_gemm_ptx.ll
+++ /dev/null
@@ -1,419 +0,0 @@
-; Test pipeline (one RUN line per step):
-;   1. opt runs the -dfg2llvm-nvptx, -dfg2llvm-x86 and -clearDFG passes on this file and writes %t.ll.
-;   2. llvm-link combines %s.kernels.ll with the libclc nvptx--nvidiacl bitcode
-;      (presumably %s.kernels.ll is emitted by step 1 -- TODO confirm).
-;   3. clang compiles the linked kernel bitcode to NVPTX assembly in %s.nvptx.s.
-;   4. llvm-link combines %t.ll with visc-rt.ll into %t.linked.ll.
-;   5. clang++ builds %t.bin, linking pthread, OpenCL and rt.
-;   6. The resulting binary is executed.
-; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_NVPTX.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-nvptx -dfg2llvm-x86 -clearDFG -o %t.ll -S %s
-; RUN: llvm-link %llvm_src/../libclc/built_libs/nvptx--nvidiacl.bc %s.kernels.ll -o %t.ll.kernels.linked.bc
-; RUN: clang -O3 -target nvptx %t.ll.kernels.linked.bc -S -o %s.nvptx.s
-; RUN: llvm-link %t.ll %llvm_src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
-; RUN: clang++ -O3 %t.linked.ll -lpthread -lOpenCL -lrt -o %t.bin
-; RUN: %t.bin
-; ModuleID = 'gemm_opencl.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; String constants. In this module @.str is used by @checkResults and
-; @str13/@str14/@str15 by @main; @.str2, @.str3, @str, @str10, @str11 and @str12
-; are referenced only from commented-out calls, and @custom_str, @hex_str and
-; @ptr_str are not referenced at all.
-@custom_str = private unnamed_addr constant [12 x i8] c"Value = %d\0A\00", align 1
-@hex_str = private unnamed_addr constant [14 x i8] c"Value = 0x%x\0A\00", align 1
-@ptr_str = private unnamed_addr constant [12 x i8] c"Value = %p\0A\00", align 1
-@.str = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
-@.str2 = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1
-@.str3 = private unnamed_addr constant [32 x i8] c"Accessing k = %d, A[%d], B[%d]\0A\00", align 1
-@str = private unnamed_addr constant [17 x i8] c"Entered function\00"
-@str10 = private unnamed_addr constant [16 x i8] c"Result computed\00"
-@str11 = private unnamed_addr constant [20 x i8] c"Result written to C\00"
-@str12 = private unnamed_addr constant [17 x i8] c"Output allocated\00"
-@str13 = private unnamed_addr constant [9 x i8] c"\0AFailed!\00"
-@str14 = private unnamed_addr constant [7 x i8] c"\0ADone!\00"
-@str15 = private unnamed_addr constant [7 x i8] c"\0APass!\00"
-
-; randomInit(data, size): writes one pseudo-random float to each of
-; data[0] .. data[size-1]. Each value is rand() converted to float and
-; multiplied by 2^-31. The loop is skipped entirely when size <= 0.
-; Function Attrs: nounwind uwtable
-define void @randomInit(float* nocapture %data, i32 %size) #0 {
-entry:
-  ; Guard: only enter the loop for a positive element count.
-  %cmp3 = icmp sgt i32 %size, 0
-  br i1 %cmp3, label %for.body, label %for.end
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %call = tail call i32 @rand() #5
-  %conv = sitofp i32 %call to float
-  ; 0x3E00000000000000 is the double encoding of 2^-31.
-  %div = fmul float %conv, 0x3E00000000000000
-  %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv
-  store float %div, float* %arrayidx, align 4, !tbaa !0
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  ; The 64-bit induction variable is truncated to i32 for the exit comparison against %size.
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %size
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ret void
-}
-
-; Function Attrs: nounwind
-declare i32 @rand() #1
-
-; isEqual(a, b): returns 1 when |a - b| < 1e-3 and 0 otherwise.
-; The comparison is ordered (olt), so a NaN difference yields 0.
-; Function Attrs: nounwind readnone uwtable
-define i32 @isEqual(float %a, float %b) #2 {
-entry:
-  %sub = fsub float %a, %b
-  %fabsf = tail call float @fabsf(float %sub) #6
-  ; The absolute difference is widened to double before comparing with the tolerance.
-  %0 = fpext float %fabsf to double
-  %cmp = fcmp olt double %0, 1.000000e-03
-  %conv1 = zext i1 %cmp to i32
-  ret i32 %conv1
-}
-
-; checkResults(A, B, C): recomputes each element of the product of A and B
-; and compares it with the corresponding element of C. All three buffers are
-; indexed as 1024x1024 arrays with a row stride of 1024 (the shifts by 10).
-; On the first element whose absolute difference is not below 1e-3 it prints
-; the "Mismatch at ..." message and returns 0; it returns 1 when every
-; element passes.
-; Function Attrs: noinline nounwind uwtable
-define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #3 {
-entry:
-  br label %for.cond4.preheader
-
-for.cond4.preheader:                              ; preds = %entry, %for.inc50
-  ; Outer loop over rows: %indvars.iv92 / %i.081 are the 64-bit and 32-bit copies of i.
-  %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ]
-  %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ]
-  ; %0 = i * 1024, the offset of row i.
-  %0 = shl nsw i64 %indvars.iv92, 10
-  br label %for.body7
-
-for.cond4:                                        ; preds = %for.end
-  ; Advance j and leave the column loop once it reaches 1024.
-  %inc48 = add nsw i32 %j.079, 1
-  %1 = trunc i64 %indvars.iv.next89 to i32
-  %cmp5 = icmp slt i32 %1, 1024
-  br i1 %cmp5, label %for.body7, label %for.inc50
-
-for.body7:                                        ; preds = %for.cond4.preheader, %for.cond4
-  ; Middle loop over columns: %indvars.iv88 / %j.079 are the 64-bit and 32-bit copies of j.
-  %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ]
-  %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ]
-  ; %2 = i * 1024 + j, the index into C.
-  %2 = add nsw i64 %indvars.iv88, %0
-  br label %for.body12
-
-for.body12:                                       ; preds = %for.body12, %for.body7
-  ; Inner loop: accumulate A[i*1024 + k] * B[k*1024 + j] for k = 0 .. 1023.
-  %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ]
-  %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ]
-  %4 = add nsw i64 %indvars.iv, %0
-  %arrayidx16 = getelementptr inbounds float* %A, i64 %4
-  %5 = load float* %arrayidx16, align 4, !tbaa !0
-  %6 = shl i64 %indvars.iv, 10
-  %7 = add nsw i64 %6, %indvars.iv88
-  %arrayidx20 = getelementptr inbounds float* %B, i64 %7
-  %8 = load float* %arrayidx20, align 4, !tbaa !0
-  %mul21 = fmul float %5, %8
-  %add26 = fadd float %3, %mul21
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.end, label %for.body12
-
-for.end:                                          ; preds = %for.body12
-  ; Compare the recomputed sum (%add26) with C[i*1024 + j]; same tolerance test as @isEqual.
-  %arrayidx34 = getelementptr inbounds float* %C, i64 %2
-  %9 = load float* %arrayidx34, align 4, !tbaa !0
-  %sub.i = fsub float %add26, %9
-  %fabsf.i = tail call float @fabsf(float %sub.i) #6
-  %10 = fpext float %fabsf.i to double
-  %cmp.i = fcmp olt double %10, 1.000000e-03
-  %indvars.iv.next89 = add i64 %indvars.iv88, 1
-  br i1 %cmp.i, label %for.cond4, label %if.then
-
-if.then:                                          ; preds = %for.end
-  ; Mismatch: print i, j, the value read from C and the recomputed value, then return 0.
-  %conv40 = fpext float %9 to double
-  %conv45 = fpext float %add26 to double
-  %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #5
-  br label %return
-
-for.inc50:                                        ; preds = %for.cond4
-  ; Advance i and leave the row loop once it reaches 1024.
-  %indvars.iv.next93 = add i64 %indvars.iv92, 1
-  %inc51 = add nsw i32 %i.081, 1
-  %11 = trunc i64 %indvars.iv.next93 to i32
-  %cmp = icmp slt i32 %11, 1024
-  br i1 %cmp, label %for.cond4.preheader, label %return
-
-return:                                           ; preds = %for.inc50, %if.then
-  ; 0 on the mismatch path, 1 when all rows completed.
-  %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ]
-  ret i32 %retval.0
-}
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #1
-
-; Function Attrs: nounwind
-declare i32 @printf(i8* nocapture, ...) #1
-
-; --------------- VISC Intrinsics ---------------
-; Return Type of VISC Compute Matrix Mul
-%rtype = type {}
-%struct.arg = type <{ float*, i64, float*, i64, float*, i64, i32, i32, i32, %rtype }>
-
-; Function Attrs: nounwind
-declare void @llvm.visc.init() #1
-
-; Function Attrs: nounwind
-declare void @llvm.visc.cleanup() #1
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0
-
-; No attribute group is attached to this declaration
-declare void @llvm.visc.bind.input(i8*, i32, i32)
-
-; No attribute group is attached to this declaration
-declare void @llvm.visc.bind.output(i8*, i32, i32)
-; ----------------- VISC intrinsics end ------------------
-
-; matrixMul: computes a single element of C. With x and y the node instance
-; IDs returned by the VISC intrinsics, it accumulates
-;   sum over i in [0, k) of A[y*k + i] * B[i*n + x]
-; and stores the result to C[y*n + x]. When k == 0 the stored value is 0.0.
-; %bytes_A, %bytes_B, %bytes_C and %m are not read in the body.
-; Index arithmetic is done in i32 and zero-extended to i64 before addressing.
-; Function Attrs: nounwind uwtable
-define %rtype @matrixMul(float* in nocapture %A, i64 %bytes_A, float* in nocapture %B, i64 %bytes_B, float* out %C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
-entry:
-  ;%puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str, i64 0, i64 0))
-  
-  ; ------------------------- VISC changes ------------------
-  ; Replace get_global_id calls with a call to getNode followed by getNodeInstanceID.x / .y
-  ; Replaced statement -- 
-  ; -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #5
-  ; -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #5
-  %this_node = call i8* @llvm.visc.getNode()
-  %call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node)
-  %call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node)
-  ; ---------------------- VISC changes End ------------------
-
-  ;%call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.str2, i64 0, i64 0), i32 %call1, i32 %call2) #5
-  ; Skip the accumulation loop entirely when k == 0.
-  %cmp44 = icmp eq i32 %k, 0
-  br i1 %cmp44, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %entry
-  ; %mul = y * k, the loop-invariant base index into A.
-  %mul = mul i32 %call2, %k
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %res.046 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add14, %for.body ]
-  %0 = trunc i64 %indvars.iv to i32
-  ; %add = y*k + i (index into A); %add5 = i*n + x (index into B).
-  %add = add i32 %0, %mul
-  %mul4 = mul i32 %0, %n
-  %add5 = add i32 %mul4, %call1
-  ;%call6 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str3, i64 0, i64 0), i32 %k, i32 %add, i32 %add5) #5
-  %idxprom = zext i32 %add to i64
-  %arrayidx = getelementptr inbounds float* %A, i64 %idxprom
-  %1 = load float* %arrayidx, align 4, !tbaa !0
-  %idxprom11 = zext i32 %add5 to i64
-  %arrayidx12 = getelementptr inbounds float* %B, i64 %idxprom11
-  %2 = load float* %arrayidx12, align 4, !tbaa !0
-  %mul13 = fmul float %1, %2
-  %add14 = fadd float %res.046, %mul13
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %k
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ; Accumulated sum, or 0.0 when the loop was skipped.
-  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add14, %for.body ]
-  ;%puts41 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @str10, i64 0, i64 0))
-  ; Store to C[y*n + x].
-  %mul16 = mul i32 %call2, %n
-  %add17 = add i32 %mul16, %call1
-  %idxprom18 = zext i32 %add17 to i64
-  %arrayidx19 = getelementptr inbounds float* %C, i64 %idxprom18
-  store float %res.0.lcssa, float* %arrayidx19, align 4, !tbaa !0
-  ;%puts42 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @str11, i64 0, i64 0))
-  ;%puts43 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @str12, i64 0, i64 0))
-  ; %rtype is the empty struct, so there is no value to return.
-  ret %rtype undef
-}
-
-; ----------------- VISC SGEMM root node ----------------
-; Root node of the dataflow graph: creates one 2D node running @matrixMul with
-; dimensions (%WB, %HA) and binds inputs 0..8 of the root one-to-one to
-; inputs 0..8 of that node. No outputs are bound and the returned %rtype is empty.
-define %rtype @MatrixMulRoot(float* %h_A, i64 %bytes_A, float* %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA)
-  ; Bind Inputs
-  ; NOTE(review): the two i32 operands look like (source index, destination index) -- confirm against the intrinsic definition.
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
-  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
-  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
-  ; Bind Outputs
-  ret %rtype undef
-}
-
-; Function Attrs: noinline nounwind uwtable
-;define %rtype @computeMatrixMul(float* nocapture %h_A, i64 %bytes_A, float* nocapture %h_B, i64 %bytes_B, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 %m) #3 {
-;entry:
-;  %cmp18 = icmp eq i32 %m, 0
-;  %cmp215 = icmp eq i32 %n, 0
-;  %or.cond = or i1 %cmp18, %cmp215
-;  br i1 %or.cond, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.inc4.us:                                      ; preds = %for.body3.us
-;  %0 = extractvalue %rtype %call.us, 0
-;  %1 = extractvalue %rtype %call.us, 1
-;  %inc5.us = add i32 %i.019.us, 1
-;  %exitcond24 = icmp eq i32 %inc5.us, %m
-;  br i1 %exitcond24, label %for.end6, label %for.body3.lr.ph.us
-;
-;for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
-;  %j.016.us = phi i32 [ 0, %for.body3.lr.ph.us ], [ %inc.us, %for.body3.us ]
-;  %call.us = tail call %rtype @matrixMul(float* %h_A, i64 undef, float* %h_B, i64 undef, float* %h_C, i64 %bytes_C, i32 %k, i32 %n, i32 undef, i32 undef, i32 undef)
-;  %inc.us = add i32 %j.016.us, 1
-;  %exitcond = icmp eq i32 %inc.us, %n
-;  br i1 %exitcond, label %for.inc4.us, label %for.body3.us
-;
-;for.body3.lr.ph.us:                               ; preds = %entry, %for.inc4.us
-;  %i.019.us = phi i32 [ %inc5.us, %for.inc4.us ], [ 0, %entry ]
-;  br label %for.body3.us
-;
-;for.end6:                                         ; preds = %for.inc4.us, %entry
-;  %Out.sroa.1.0.lcssa = phi i32 [ undef, %entry ], [ %1, %for.inc4.us ]
-;  %Out.sroa.0.0.lcssa = phi float* [ undef, %entry ], [ %0, %for.inc4.us ]
-;  %.fca.0.insert = insertvalue %rtype undef, float* %Out.sroa.0.0.lcssa, 0
-;  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %Out.sroa.1.0.lcssa, 1
-;  ret %rtype %.fca.1.insert
-;}
-
-; main: seeds rand() with 2006, allocates three 4194304-byte buffers
-; (1048576 floats each), fills the first two with random values, launches the
-; @MatrixMulRoot graph through the VISC intrinsics, waits for it, and checks
-; the third buffer with @checkResults. Prints "Pass!" or "Failed!", then
-; "Done!", frees the buffers and always returns 0.
-; Note: the malloc results are used without a null check.
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
-entry:
-  tail call void @srand(i32 2006) #5
-  ; %call backs matrix A (%0); %call7 backs matrix B (%1).
-  %call = tail call noalias i8* @malloc(i64 4194304) #5
-  %0 = bitcast i8* %call to float*
-  %call7 = tail call noalias i8* @malloc(i64 4194304) #5
-  br label %for.body.i
-
-for.body.i:                                       ; preds = %for.body.i, %entry
-  ; Same loop body as @randomInit, filling the 1048576 floats of A.
-  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ]
-  %call.i = tail call i32 @rand() #5
-  %conv.i = sitofp i32 %call.i to float
-  %div.i = fmul float %conv.i, 0x3E00000000000000
-  %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i
-  store float %div.i, float* %arrayidx.i, align 4, !tbaa !0
-  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
-  %lftr.wideiv42 = trunc i64 %indvars.iv.next.i to i32
-  %exitcond43 = icmp eq i32 %lftr.wideiv42, 1048576
-  br i1 %exitcond43, label %for.body.i40.preheader, label %for.body.i
-
-for.body.i40.preheader:                           ; preds = %for.body.i
-  %1 = bitcast i8* %call7 to float*
-  br label %for.body.i40
-
-for.body.i40:                                     ; preds = %for.body.i40.preheader, %for.body.i40
-  ; Same loop again, filling the 1048576 floats of B.
-  %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ]
-  %call.i33 = tail call i32 @rand() #5
-  %conv.i34 = sitofp i32 %call.i33 to float
-  %div.i35 = fmul float %conv.i34, 0x3E00000000000000
-  %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32
-  store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0
-  %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next.i37 to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1048576
-  br i1 %exitcond, label %randomInit.exit41, label %for.body.i40
-
-randomInit.exit41:                                ; preds = %for.body.i40
-  ; %call12 backs matrix C (%2); it is not initialised before the launch.
-  %call12 = tail call noalias i8* @malloc(i64 4194304) #5
-  %2 = bitcast i8* %call12 to float*
-
-  ; ---------------------------------- Adding VISC Launch Call --------------------------------
-  ; Replaced - %out = tail call %rtype @computeMatrixMul(float* %0, i32 undef, float* %1, i32 undef, float* %2, i32 4194304, i32 1024, i32 1024, i32 1024)
-  ; Setting up launch input args
-  call void @llvm.visc.init()
-  ; Packed argument struct: fields 0..8 mirror the parameters of @MatrixMulRoot, field 9 is the (empty) result.
-  %in.addr = alloca %struct.arg
-
-  ; Store arguments
-  %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
-  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
-  %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
-  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
-  %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4
-  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5
-  %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6
-  %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7
-  %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
-
-  store float* %0, float** %in.addr.h_A
-  store i64 4194304, i64* %in.addr.bytes_A
-  store float* %1, float** %in.addr.h_B
-  store i64 4194304, i64* %in.addr.bytes_B
-  store float* %2, float** %in.addr.h_C
-  store i64 4194304, i64* %in.addr.bytes_C
-  store i32 1024, i32* %in.addr.WA
-  store i32 1024, i32* %in.addr.WB
-  store i32 1024, i32* %in.addr.HA
-
-  ; Change type to i8* and VISC Launch call
-  %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i64, float*, i64, float*, i64, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
-
-  ; Wait for result
-  call void @llvm.visc.wait(i8* %graphID)
-
-  ; Get the result
-  ; %out is loaded from field 9 but not used afterwards in this function.
-  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9
-  %out = load %rtype* %out.addr
-  call void @llvm.visc.cleanup()
-  ; -------------------------------- Completed VISC Launch Call --------------------------------
-
-  ; A non-zero result from @checkResults selects the "Pass!" branch.
-  %call14 = tail call i32 @checkResults(float* %0, float* %1, float* %2)
-  %tobool = icmp eq i32 %call14, 0
-  br i1 %tobool, label %if.else, label %if.then
-
-if.then:                                          ; preds = %randomInit.exit41
-  %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str15, i64 0, i64 0))
-  br label %if.end
-
-if.else:                                          ; preds = %randomInit.exit41
-  %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str13, i64 0, i64 0))
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str14, i64 0, i64 0))
-  tail call void @free(i8* %call) #5
-  tail call void @free(i8* %call7) #5
-  tail call void @free(i8* %call12) #5
-  ret i32 0
-}
-
-; Function Attrs: nounwind
-declare void @srand(i32) #1
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #1
-
-declare float @fabsf(float)
-
-; Function Attrs: nounwind
-declare i32 @puts(i8* nocapture) #5
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { noinline nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #5 = { nounwind }
-attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!0 = metadata !{metadata !"float", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/hpvm/test/edge/CMakeLists.txt b/hpvm/test/edge/CMakeLists.txt
deleted file mode 100644
index 24dda49022d15ee1c1b3551046780aefe25595e8..0000000000000000000000000000000000000000
--- a/hpvm/test/edge/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# Builds the two edge-detection demos against OpenCV.
-# PipelineEdgeDetect additionally links the platform thread library.
-cmake_minimum_required(VERSION 2.8)
-project( edgeDetection )
-find_package( OpenCV REQUIRED )
-find_package( Threads REQUIRED )
-add_executable( PipelineEdgeDetect PipelineEdgeDetect.cpp )
-add_executable( EdgeDetect EdgeDetect.cpp )
-target_link_libraries( PipelineEdgeDetect ${OpenCV_LIBS} ${CMAKE_THREAD_LIBS_INIT} )
-target_link_libraries( EdgeDetect ${OpenCV_LIBS} )
-# Append the language-standard flag instead of overwriting CMAKE_CXX_FLAGS,
-# so flags supplied on the command line or by a toolchain file are kept.
-SET( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x" )
diff --git a/hpvm/test/edge/ESO_Very_Large_Telescope.jpg b/hpvm/test/edge/ESO_Very_Large_Telescope.jpg
deleted file mode 100644
index 3aed6db383661c42ac69ebfea66b556c66ec934e..0000000000000000000000000000000000000000
Binary files a/hpvm/test/edge/ESO_Very_Large_Telescope.jpg and /dev/null differ
diff --git a/hpvm/test/edge/EdgeDetect.cpp b/hpvm/test/edge/EdgeDetect.cpp
deleted file mode 100644
index 62cca0cdd0be007087a2e90749289d2c3321354f..0000000000000000000000000000000000000000
--- a/hpvm/test/edge/EdgeDetect.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-#include "opencv2/opencv.hpp"
-#include "opencv2/core/ocl.hpp"
-#include <iostream>
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include "time.h"
-
-#define NUM_FRAMES 200
-
-using namespace cv;
-using namespace std;
-
-
-/// Global variables
-string window_name = "Edge Map";
-
-
-// Blurs *I with a square Gaussian kernel of standard deviation Sn.
-// The kernel side length is 2*ceil(3*Sn)+1. The result is a UMat allocated
-// with new; the caller receives the raw pointer.
-UMat* GaussianSmoothening(UMat* I, float Sn) {
-    UMat* blurred = new UMat(I->rows, I->cols, I->type());
-    const int side = 2*ceil(3*Sn)+1;
-    const Size kernel(side, side);
-    GaussianBlur(*I, *blurred, kernel, Sn);
-    return blurred;
-}
-
-// Non-linear Laplacian estimate: erode(IBlur) + dilate(IBlur) - 2*IBlur,
-// using B as the structuring element for both morphological operations.
-// Side effect: *IBlur is overwritten with 2*IBlur.
-// The result is a UMat allocated with new; the caller receives the raw pointer.
-UMat* NonLinearLaplacian(UMat* IBlur, Mat B) {
-    UMat eroded, dilated;
-    UMat* laplacian = new UMat(IBlur->rows, IBlur->cols, IBlur->type());
-
-    erode(*IBlur, eroded, B);
-    dilate(*IBlur, dilated, B);
-
-    // eroded := erode + dilate
-    add(eroded, dilated, eroded);
-    // *IBlur := IBlur + IBlur (doubles the input in place)
-    add(*IBlur, *IBlur, *IBlur);
-    // laplacian := (erode + dilate) - 2*IBlur
-    subtract(eroded, *IBlur, *laplacian);
-
-    eroded.release();
-    dilated.release();
-    return laplacian;
-}
-
-// Builds the mask (I >= 0), then returns dilate(mask) - erode(mask) with SE
-// as the structuring element for both operations.
-// The result is a UMat allocated with new; the caller receives the raw pointer.
-UMat* ZeroCrossings(UMat* I, Mat SE) {
-    UMat maskEroded, maskDilated;
-    UMat nonNegative;
-
-    // nonNegative := (I >= 0)
-    compare(*I, 0, nonNegative, CMP_GE);
-    UMat* crossings = new UMat(I->rows, I->cols, I->type());
-
-    erode(nonNegative, maskEroded, SE);
-    dilate(nonNegative, maskDilated, SE);
-    subtract(maskDilated, maskEroded, *crossings);
-
-    nonNegative.release();
-    maskDilated.release();
-    maskEroded.release();
-    return crossings;
-}
-
-// Returns the magnitude of the Sobel derivatives of *I: first-order in x and
-// in y, each called with ddepth -1 and ksize 1.
-// The result is a UMat allocated with new; the caller receives the raw pointer.
-UMat* Gradient(UMat* I) {
-    UMat gradX, gradY;
-    UMat* gradMag = new UMat(I->rows, I->cols, I->type());
-
-    Sobel(*I, gradX, -1, 1, 0, 1);   // d/dx
-    Sobel(*I, gradY, -1, 0, 1, 1);   // d/dy
-    magnitude(gradX, gradY, *gradMag);
-
-    gradX.release();
-    gradY.release();
-    return gradMag;
-}
-
-// Keeps only those entries of Z where the gradient magnitude dI exceeds
-// Threshold times the maximum of dI: returns Z & (dI > Threshold * max(dI)).
-// The result is a UMat allocated with new; the caller receives the raw pointer.
-UMat* RejectZeroCrossings(UMat* dI, UMat* Z, float Threshold) {
-    double gradMax;
-    UMat* edges = new UMat(Z->rows, Z->cols, Z->type());
-    minMaxLoc(*dI, NULL, &gradMax);
-
-    UMat strong;
-    const double cutoff = Threshold*gradMax;
-    compare(*dI, cutoff, strong, CMP_GT);
-    bitwise_and(*Z, strong, *edges);
-    strong.release();
-    return edges;
-}
-
-/* Edge Detect
- * Returns edges of image I in binary matrix E
- * Sn is the standard deviation of Gaussians needed for the filters
- * Threshold is the fraction of the maximum gradient magnitude a zero crossing
- * must exceed to be kept as an edge
- * B is the structuring element used for the erode/dilate steps
- *
- * Side effect: *I is released before returning.
- * The returned UMat is allocated with new by RejectZeroCrossings; the caller
- * owns it. The intermediate UMats allocated by the helper stages are deleted
- * here (previously they were only release()d, leaking the heap objects).
- */
-UMat* EdgeDetect(UMat* I, float Sn, float Threshold, Mat B) {
-
-    // Gaussian Smoothening
-    UMat* IBlur = GaussianSmoothening(I, Sn);
-
-    // Gradient Computation (on the unblurred input)
-    UMat* dI = Gradient(I);
-
-    // The input is no longer needed once both consumers have run.
-    I->release();
-
-    // Non-Linear Laplacian Estimate
-    UMat* L = NonLinearLaplacian(IBlur, B);
-    // delete runs the UMat destructor, which also releases the data.
-    delete IBlur;
-
-    // Find zero-crossings
-    UMat* Z = ZeroCrossings(L, B);
-    delete L;
-
-    // Reject Zero Crossings
-    UMat* E = RejectZeroCrossings(dI, Z, Threshold);
-    delete Z;
-    delete dI;
-    return E;
-}
-
-
-/** @function main
- * Loads the image named by argv[1], converts it to single-channel float in
- * [0, 1], runs EdgeDetect on it NUM_FRAMES times and prints the elapsed time.
- * Returns -1 when no image path is given or the image cannot be read.
- */
-int main( int argc, char** argv )
-{
-    // argv[1] is dereferenced below, so require it explicitly.
-    if( argc < 2 )
-    {
-        cerr << "Usage: EdgeDetect <image>\n";
-        return -1;
-    }
-
-    cout << "Accelerating Using OpenCV" << CV_VERSION << "\n";
-    ocl::setUseOpenCL(false);
-    cout << "OpenCL: " << ocl::useOpenCL() << "\n";
-    /// Load an image
-    Mat src = imread( argv[1] );
-
-    if( !src.data )
-    {
-        return -1;
-    }
-
-    /// Convert the image to grayscale
-    cvtColor( src, src, CV_BGR2GRAY );
-
-    /// Create a window
-    namedWindow( window_name, CV_WINDOW_AUTOSIZE );
-
-    src.convertTo(src, CV_32F, 1.0/255.0);
-    Mat B = getStructuringElement(MORPH_CROSS, Size(3,3));
-    Timestamp start = get_time(TIMER_MT);
-    for(int i=0; i<NUM_FRAMES; i++) {
-        UMat srcUMat = src.getUMat(ACCESS_READ);
-        UMat* E = EdgeDetect(&srcUMat, 1.0, 0.1, B);
-        // Map the result back to a Mat before discarding it
-        // (presumably to force the computation to complete -- TODO confirm).
-        E->getMat(ACCESS_RW);
-        E->release();
-        // EdgeDetect returns a heap-allocated UMat; free it so each frame does not leak.
-        delete E;
-    }
-
-    Timestamp end = get_time(TIMER_MT);
-    // Parenthesised so the difference is taken first, matching the timing
-    // expressions used in PipelineEdgeDetect.cpp.
-    cout << "Running time  = " << (end-start)*1.0/BILLION  << " seconds\n";
-
-    return 0;
-}
diff --git a/hpvm/test/edge/PipelineEdgeDetect.cpp b/hpvm/test/edge/PipelineEdgeDetect.cpp
deleted file mode 100644
index 309902196731c270f5cb02ae9c7fc1c91580735f..0000000000000000000000000000000000000000
--- a/hpvm/test/edge/PipelineEdgeDetect.cpp
+++ /dev/null
@@ -1,481 +0,0 @@
-#include <opencv2/opencv.hpp>
-#include <opencv2/core/ocl.hpp>
-#include <cstdlib>
-#include <cstdio>
-#include <math.h>
-#include <iostream>
-#include <condition_variable>
-#include <mutex>
-#include <thread>
-#include <vector>
-#include "time.h"
-
-#define NUM_FRAMES 200
-#ifdef DEBUG
-#define DEBUG(X) X
-#else
-#define DEBUG(X)
-#endif
-
-#define OPENCL false
-
-using namespace std;
-using namespace cv;
-
-string window_name = "Edge Map";
-
-// Circular Buffer class
-// Fixed-capacity ring buffer of ElementType* guarded by a mutex and a
-// condition variable. One slot is always left empty so that a full buffer
-// ((Head+1) % bufferSize == Tail) can be told apart from an empty one
-// (Head == Tail); hence bufferSize = maxElements + 1.
-// The buffer stores raw pointers and never deletes them.
-template <class ElementType>
-class CircularBuffer {
-private:
-    int numElements;                 // number of elements currently stored
-    int bufferSize;                  // number of slots (maxElements + 1)
-    int Head;                        // index of the next slot to write
-    int Tail;                        // index of the next slot to read
-    mutex mtx;                       // protects all members below and above
-    condition_variable cv;           // signalled after every push and pop
-    vector<ElementType*> buffer;     // slot storage, always bufferSize long
-    string name;                     // label used in DEBUG output
-
-public:
-    // maxElements: number of elements the buffer can hold at once.
-    // _name: label printed in DEBUG traces.
-    CircularBuffer(int maxElements, string _name = "ANON_BUFFER") {
-        Head = 0;
-        Tail = 0;
-        numElements = 0;
-        bufferSize = maxElements+1;
-        name = _name;
-        // resize (not reserve): push/pop index buffer[Head]/buffer[Tail]
-        // directly, which is only valid for elements that actually exist.
-        buffer.resize(bufferSize);
-    }
-
-    // Blocks while the buffer is full, then stores E. Always returns true.
-    bool push(ElementType* E, string caller);
-    // Blocks while the buffer is empty, then returns the oldest element.
-    ElementType* pop(string caller);
-
-};
-
-// Appends E at Head, blocking while the buffer is full.
-// caller is only used to label the DEBUG trace. Always returns true.
-template <class ElementType>
-bool CircularBuffer<ElementType>::push(ElementType* E, string caller) {
-    unique_lock<mutex> lk(mtx);
-    // Re-check the "full" condition after every wakeup: condition variables
-    // may wake spuriously, and this cv is also notified by other push calls.
-    // NOTE(review): with several producers and consumers sharing this one cv,
-    // notify_one below may wake a thread of the wrong kind -- confirm usage.
-    while((Head +1) % bufferSize == Tail) {
-        cv.wait(lk);
-    }
-    buffer[Head] = E;
-    Head = (Head+1) % bufferSize;
-    numElements++;
-    DEBUG(cout << "[" << caller << "]: " << name << " Total Elements = " << numElements << "\n");
-    // Unlock before notifying so the woken thread can take the mutex immediately.
-    lk.unlock();
-    cv.notify_one();
-    return true;
-}
-
-// Removes and returns the element at Tail, blocking while the buffer is empty.
-// caller is only used to label the DEBUG trace.
-template <class ElementType>
-ElementType* CircularBuffer<ElementType>::pop(string caller) {
-    unique_lock<mutex> lk(mtx);
-    // Re-check the "empty" condition after every wakeup: condition variables
-    // may wake spuriously, and this cv is also notified by other pop calls.
-    // NOTE(review): with several producers and consumers sharing this one cv,
-    // notify_one below may wake a thread of the wrong kind -- confirm usage.
-    while(Tail == Head) {
-        cv.wait(lk);
-    }
-    ElementType* E = buffer[Tail];
-    Tail = (Tail + 1) % bufferSize;
-    numElements--;
-    DEBUG(cout << "[" << caller << "]: " << name << " Total Elements = " << numElements << "\n");
-    // Unlock before notifying so the woken thread can take the mutex immediately.
-    lk.unlock();
-    cv.notify_one();
-    return E;
-}
-// --------------------------------------------------------------------------
-
-
-// Read an image from a file and convert it to gray scale.
-// The result is a single-channel CV_32F Mat scaled by 1/255.
-// Exits the process with status -1 when the image cannot be read; the error
-// message is only printed when DEBUG output is enabled.
-Mat readImage(char* file) {
-    Mat src = imread( file );
-
-    if( !src.data )
-    {
-        DEBUG(cout << "Error: Canot read input image " << file << "\n");
-        exit(-1);
-    }
-
-    /// Convert the image to grayscale (in place)
-    cvtColor( src, src, CV_BGR2GRAY );
-    src.convertTo(src, CV_32F, 1.0/255.0);
-    return src;
-}
-
-// Gaussian smoothing pipeline stage.
-// For NUM_FRAMES iterations: pops a frame from in_I, blurs it with a square
-// Gaussian kernel of side 2*ceil(3*Sn)+1 and standard deviation Sn, pushes
-// the newly allocated result to out_IBlur, then releases the input frame.
-// On return *time holds the elapsed TIMER_THREAD time divided by BILLION.
-// NOTE(review): the popped Mat* is released but not deleted here -- confirm
-// who owns the Mat objects placed on in_I.
-void GaussianSmoothening(float Sn, CircularBuffer<Mat>* in_I, CircularBuffer<Mat>* out_IBlur, float* time) {
-    Timestamp t_begin, t_end;
-    t_begin = get_time(TIMER_THREAD);
-    ocl::setUseOpenCL(OPENCL);
-    DEBUG(cout << "Gaussian Smoothening Starts\n");
-    const string id("GS");
-    // Sn does not change, so the kernel side is the same for every frame.
-    const int side = 2*ceil(3*Sn) + 1;
-    for(int frame = 0; frame < NUM_FRAMES; frame++) {
-        Mat* input = in_I->pop(id);
-        DEBUG(cout << "[" << id << "]: Iteration " << frame << "\n");
-        Mat* blurred = new Mat(input->rows, input->cols, input->type());
-        GaussianBlur(*input, *blurred, Size(side, side), Sn);
-        out_IBlur->push(blurred, id);
-        input->release();
-    }
-    t_end = get_time(TIMER_THREAD);
-    *time = (t_end-t_begin)*1.0/BILLION;
-    DEBUG(cout << "Gaussian Smoothening Done\n");
-}
-
-// Laplacian Node
-// For NUM_FRAMES iterations: pops a blurred frame from in_IBlur, computes
-// erode(IBlur) + dilate(IBlur) - 2*IBlur with structuring element B on UMat
-// copies, and pushes the newly allocated result Mat to out_L.
-// This stage always enables OpenCL (setUseOpenCL(true)), independent of the
-// OPENCL macro.
-// On return *time holds the elapsed TIMER_THREAD time divided by BILLION.
-// ctime is accepted for interface compatibility but is not written.
-//
-// The per-step timing accumulators that used to live here were removed: their
-// t1/t2 samples were commented out, so the one remaining statement
-// (release += t2 - t1) read uninitialised values.
-void NonLinearLaplacian(Mat B, CircularBuffer<Mat>* in_IBlur, CircularBuffer<Mat>* out_L, float* time, float* ctime) {
-    Timestamp start, end;
-    (void)ctime;    // unused; see header comment
-
-    start = get_time(TIMER_THREAD);
-    ocl::setUseOpenCL(true);
-    DEBUG(cout << "Non-linear Laplacian Starts\n");
-    string id = "L";
-    int i = 0;
-    while(i < NUM_FRAMES) {
-        // Wait for the next blurred frame
-        Mat* IBlur = in_IBlur->pop(id);
-        DEBUG(cout << "[" << id << "]: Iteration " << i << "\n");
-
-        // UMat view of the input frame
-        UMat IErode, IDilate;
-        UMat IBlurU = IBlur->getUMat(ACCESS_READ);
-
-        // Erode / dilate for the non-linear Laplacian estimate
-        erode(IBlurU, IErode, B);
-        dilate(IBlurU, IDilate, B);
-
-        // Allocate the output matrix
-        Mat* L = new Mat(IBlur->rows, IBlur->cols, IBlur->type());
-
-        // IErode := erode + dilate; IBlurU := 2 * IBlur
-        // NOTE(review): IBlurU was obtained with ACCESS_READ but is written
-        // here -- confirm this is permitted for the OpenCV version in use.
-        add(IErode, IDilate, IErode);
-        add(IBlurU, IBlurU, IBlurU);
-
-        // UMat view of the output matrix
-        UMat LU = L->getUMat(ACCESS_WRITE);
-
-        // L := (erode + dilate) - 2 * IBlur
-        subtract(IErode, IBlurU, LU);
-
-        // Hand the result to the next stage
-        // NOTE(review): L is pushed while LU (its UMat view) is still alive;
-        // confirm the consumer cannot observe L before LU is destroyed.
-        out_L->push(L, id);
-
-        // Free per-iteration memory
-        IErode.release();
-        IDilate.release();
-        IBlur->release();
-        IBlurU.release();
-        i++;
-    }
-    end = get_time(TIMER_THREAD);
-    *time = (end-start)*1.0/BILLION;
-    DEBUG(cout << "Non-linear Laplacian Done\n");
-}
-
-// Gradient Node
-void Gradient(CircularBuffer<Mat>* in_I, CircularBuffer<Mat>* out_dI, float* time) {
-    Timestamp start, end;
-    start = get_time(TIMER_THREAD);
-    ocl::setUseOpenCL(OPENCL);
-    DEBUG(cout << "Gradient Starts\n");
-    string id = "G";
-    int i = 0;
-    while(i < NUM_FRAMES) {
-        //waitKey(0);
-        Mat* I = in_I->pop(id);
-        DEBUG(cout << "[" << id << "]: Iteration " << i << "\n");
-        // Gradient Computation
-        //Mat dIx, dIy;
-        UMat dIx, dIy;
-        UMat IU = I->getUMat(ACCESS_READ);
-        Sobel(IU, dIx, -1, 1, 0, 1);
-        Sobel(IU, dIy, -1, 0, 1, 1);
-        Mat* dI = new Mat(I->rows, I->cols, I->type());
-        magnitude(dIx, dIy, dI->getUMat(ACCESS_WRITE));
-        out_dI->push(dI, id);
-        dIx.release();
-        dIy.release();
-        I->release();
-        IU.release();
-        i++;
-    }
-    end = get_time(TIMER_THREAD);
-    *time = (end-start)*1.0/BILLION;
-    DEBUG(cout << "Gradient Done\n");
-}
-
-// Find Zero Crossings Node
-void ZeroCrossings(Mat SE, CircularBuffer<Mat>* in_L, CircularBuffer<Mat>* out_Z, float* time) {
-    Timestamp start, end;
-    start = get_time(TIMER_THREAD);
-    ocl::setUseOpenCL(OPENCL);
-    DEBUG(cout << "ZeroCrossings Start\n");
-    string id = "ZC";
-    int i = 0;
-    while(i < NUM_FRAMES) {
-        //waitKey(0);
-        Mat* L = in_L->pop(id);
-        DEBUG(cout << "[" << id << "]: Iteration " << i << "\n");
-
-        Mat PErode, PDilate;
-        Mat P;
-        compare(*L, 0, P, CMP_GE);
-        //Mat P = (*L >= 0);
-        erode(P, PErode, SE);
-        dilate(P, PDilate, SE);
-        //Mat* Z = new Mat(L->rows, L->cols, L->type());
-        //*Z = PDilate.getMat(ACCESS_READ)-PErode.getMat(ACCESS_READ);
-        Mat* Z = new Mat(L->rows, L->cols, L->type());
-        subtract(PDilate, PErode, *Z);
-        //*Z = PDilate.getMat(ACCESS_READ)-PErode.getMat(ACCESS_READ);
-        out_Z->push(Z, id);
-        L->release();
-        P.release();
-        PDilate.release();
-        PErode.release();
-        i++;
-    }
-    end = get_time(TIMER_THREAD);
-    *time = (end-start)*1.0/BILLION;
-    DEBUG(cout << "ZeroCrossings Done\n");
-
-}
-
-// Reject Zero crossings Node
-void RejectZeroCrossings(float Threshold, CircularBuffer<Mat>* in_dI,
-                         CircularBuffer<Mat>* in_Z, CircularBuffer<Mat>* out_E, float* time) {
-    Timestamp start, end;
-    start = get_time(TIMER_THREAD);
-    ocl::setUseOpenCL(OPENCL);
-    // Reject Zero Crossings
-    DEBUG(cout << "Reject Zero Crossings Starts\n");
-    string id = "RZC";
-    int i = 0;
-    while(i < NUM_FRAMES) {
-        //waitKey(0);
-        Mat* dI = in_dI->pop(id);
-        Mat* Z = in_Z->pop(id);
-        DEBUG(cout << "[" << id << "]: Iteration " << i << "\n");
-        double dI_max;
-        Mat* E = new Mat(Z->rows, Z->cols, Z->type());
-        minMaxLoc(*dI, NULL, &dI_max);
-
-        //*E = *Z & (dI->getMat(ACCESS_READ)> Threshold*dI_max);
-        Mat temp;
-        compare(*dI, Threshold*dI_max, temp, CMP_GT);
-        bitwise_and(*Z, temp, *E);
-        out_E->push(E, id);
-        dI->release();
-        Z->release();
-        temp.release();
-        i++;
-    }
-    end = get_time(TIMER_THREAD);
-    *time = (end-start)*1.0/BILLION;
-    DEBUG(cout << "Reject Zero Crossings Done\n");
-}
-
-// Producer thread, feeds in the same image NUM_FRAMES times to Gaussian and
-// Gradient node
-void producer_thread(Mat* I, CircularBuffer<Mat>* out_I_Gaussian, CircularBuffer<Mat>* out_I_Gradient, float* time) {
-    Timestamp start, end;
-    start = get_time(TIMER_THREAD);
-    ocl::setUseOpenCL(OPENCL);
-    DEBUG(cout << "Producer Start\n");
-    string id = "P";
-    int i = 0;
-    //Mat UI = I->getMat(ACCESS_READ);
-    while(i < NUM_FRAMES) {
-        //waitKey(0);
-        DEBUG(cout << "[" << id << "]: Iteration " << i << "\n");
-        Mat* I1 = new Mat(I->rows, I->cols, I->type());
-        Mat* I2 = new Mat(I->rows, I->cols, I->type());
-        I->copyTo(*I1);
-        I->copyTo(*I2);
-        out_I_Gaussian->push(I1, id);
-        out_I_Gradient->push(I2, id);
-        i++;
-    }
-
-    //UI.release();
-    I->release();
-    end = get_time(TIMER_THREAD);
-    *time = (end-start)*1.0/BILLION;
-    DEBUG(cout << "Producer Done!\n");
-}
-
-// Consumer Node for displaying the image
-void consumer_thread(CircularBuffer<Mat>* in_E, float* time) {
-    Timestamp start, end;
-    start = get_time(TIMER_THREAD);
-    ocl::setUseOpenCL(OPENCL);
-    DEBUG(cout << "Consumer Start\n");
-    string id = "C";
-    int i = 0;
-    while(i < NUM_FRAMES) {
-        //waitKey(0);
-        Mat* E = in_E->pop(id);
-        //E->getMat(ACCESS_RW);
-        DEBUG(cout << "[" << id << "]: Iteration " << i << "\n");
-        //imshow(window_name, *E);
-        //waitKey(0);
-        E->release();
-        i++;
-    }
-    end = get_time(TIMER_THREAD);
-    *time = (end-start)*1.0/BILLION;
-    DEBUG(cout << "Consumer Done!\n");
-}
-
-
-int main(int argc, char* argv[]) {
-    int bufferSize = stoi(argv[1]);
-    int iterations = 1;
-    if(argc == 4)
-      iterations = stoi(argv[3]);
-    float Threshold = 0.1;
-    float Sn = 1.0;
-
-    DEBUG(cout << "Accelerating Using OpenCV" << CV_VERSION << "\n");
-    ocl::setUseOpenCL(OPENCL);
-    cout << "Use OpenCL: " << ocl::useOpenCL() << "\n";
-
-    /// Load an image
-    Timestamp start, end;
-    //start = get_time(TIMER_MT);
-    //end = get_time(TIMER_MT);
-    //cout << "Running time  = " << (end-start * 1.0)/BILLION  << " seconds\n";
-    Mat B = getStructuringElement(MORPH_CROSS, Size(3,3));
-
-    namedWindow( window_name, CV_WINDOW_AUTOSIZE );
-    DEBUG(cout << "Main Starts\n");
-
-    float tGS, tG, tL, tRZC, tZC, tP, tC, tMain, tCopy;
-    float aggtGS=0, aggtG=0, aggtL=0, aggtRZC=0, aggtZC=0, aggtP=0, aggtC=0, aggtMain=0;
-    for(int i = 0; i<iterations+1; i++) {
-        Mat src1 = readImage(argv[2]);
-        //cout << "Image Size = " << src1.rows << ", " << src1.cols << "\n";
-        start = get_time(TIMER_MT);
-        // Producer
-        CircularBuffer<Mat> Producer_Gaussian_I(bufferSize, "Producer-->Gaussian-I");
-        CircularBuffer<Mat> Producer_Gradient_I(bufferSize, "Producer-->Gradient-I");
-        thread producer(producer_thread, &src1, &Producer_Gaussian_I, &Producer_Gradient_I, &tP);
-
-        // Gaussian Smoothening Node
-        CircularBuffer<Mat> Gaussian_Laplacian_IBlur(bufferSize, "Gaussian-->Laplacian-IBlur");
-        thread GaussianNode(GaussianSmoothening, Sn, &Producer_Gaussian_I, &Gaussian_Laplacian_IBlur, &tGS);
-
-        // Gradient Node
-        CircularBuffer<Mat> Gradient_Reject_dI(bufferSize, "Gradient-->Reject-dI");
-        thread GradientNode(Gradient, &Producer_Gradient_I, &Gradient_Reject_dI, &tG);
-
-        // Laplacian Node
-        CircularBuffer<Mat> Laplacian_Zero_L(bufferSize, "Laplacian-->Zero-L");
-        thread LaplacianNode(NonLinearLaplacian, B, &Gaussian_Laplacian_IBlur, &Laplacian_Zero_L, &tL, &tCopy);
-
-        // Zero Crossings Node
-        CircularBuffer<Mat> Zero_Reject_Z(bufferSize, "Zero-->Reject-Z");
-        thread ZeroCrossingsNode(ZeroCrossings, B, &Laplacian_Zero_L, &Zero_Reject_Z, &tZC);
-
-        // Reject Zero Crossings Node
-        CircularBuffer<Mat> Reject_Consumer_E(bufferSize, "Reject-->Consumer-E");
-        thread RejectZeroNode(RejectZeroCrossings, Threshold, &Gradient_Reject_dI, &Zero_Reject_Z, &Reject_Consumer_E, &tRZC);
-
-        // Consumer
-        thread consumer(consumer_thread, &Reject_Consumer_E, &tC);
-
-        // Wait for threads to finish
-        producer.join();
-        GaussianNode.join();
-        GradientNode.join();
-        LaplacianNode.join();
-        ZeroCrossingsNode.join();
-        RejectZeroNode.join();
-        consumer.join();
-
-        end = get_time(TIMER_MT);
-        tMain = (end-start*1.0)/BILLION;
-        // Skip first iteration numbers to avoid first iteration which sometimes
-        // get scheduled entirely on CPU
-        if(i > 0) {
-          aggtMain += tMain;
-          aggtGS += tGS;
-          aggtG += tG;
-          aggtL += tL;
-          aggtZC += tZC;
-          aggtRZC += tRZC;
-          aggtP += tP;
-          aggtC += tC;
-        }
-        //cout << i << ":\t(Main)\t" << tMain << "\n";
-        //cout << i << ":\t\t(L)\t" << tL << "\n";
-        //cout << i << ":\t\t\t(Copy)\t"<< tCopy << "\n";
-        //cout << i << ":\t\t(G)\t" << tG << "\n";
-        //cout << i << ":\t\t(GS)\t" << tGS << "\n";
-        //cout << i << ":\t\t(ZC)\t" << tZC << "\n";
-    }
-    cout << "Main Running time  = " << aggtMain/iterations << " seconds\n";
-    cout << "GS Running time  = " << aggtGS/iterations  << " seconds\n";
-    cout << "G Running time  = " << aggtG/iterations  << " seconds\n";
-    cout << "L Running time  = " << aggtL/iterations  << " seconds\n";
-    cout << "ZC Running time  = " << aggtZC/iterations  << " seconds\n";
-    cout << "RZC Running time  = " << aggtRZC/iterations  << " seconds\n";
-    cout << "P Running time  = " << aggtP/iterations  << " seconds\n";
-    cout << "C Running time  = " << aggtC/iterations  << " seconds\n";
-
-    cout << "Total Running time = " << (aggtGS+aggtG+aggtL+aggtZC+aggtRZC+aggtP+aggtC)/iterations << "\n";
-    /// Create a window
-    return 0;
-}
diff --git a/hpvm/test/edge/edgetest_10.png b/hpvm/test/edge/edgetest_10.png
deleted file mode 100644
index 2632a72305ed0bb69995ff1af2bd2fe1c2bdbc46..0000000000000000000000000000000000000000
Binary files a/hpvm/test/edge/edgetest_10.png and /dev/null differ
diff --git a/hpvm/test/edge/house.png b/hpvm/test/edge/house.png
deleted file mode 100644
index 6e38af9b6c77a21e23967b24444ad9262a3e6047..0000000000000000000000000000000000000000
Binary files a/hpvm/test/edge/house.png and /dev/null differ
diff --git a/hpvm/test/edge/time.h b/hpvm/test/edge/time.h
deleted file mode 100644
index 53b547e8d8c120e67dc8c30e1094e40d7a40a722..0000000000000000000000000000000000000000
--- a/hpvm/test/edge/time.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#include <ctime>
-
-typedef unsigned long long Timestamp; /* time in microseconds */
-#define BILLION   1000000000LL
-
-#define TIMER_RT CLOCK_REALTIME
-#define TIMER_MT CLOCK_MONOTONIC
-#define TIMER_PROCESS CLOCK_PROCESS_CPUTIME_ID
-#define TIMER_THREAD CLOCK_THREAD_CPUTIME_ID
-
-static Timestamp get_time(clockid_t timer)
-{
-    struct timespec tv;
-    clock_gettime(timer, &tv);
-    return (Timestamp) (tv.tv_sec * BILLION + tv.tv_nsec);
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul/Makefile b/hpvm/test/gemm_opencl/matrixMul/Makefile
deleted file mode 100644
index eb97b153334f5886a08780ef7b9a8ebbc8d05e7c..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-PASSES :=
-
-.PHONY: clean
-
-LLVM_INSTALL:=/home/psrivas2/visc/llvm-install
-LIBCLC:=/home/psrivas2/visc/libclc
-HOST:=gemm_opencl
-KERNELS:=kernel
-LLVM_CC:=$(LLVM_INSTALL)/bin/clang
-LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link
-
-all: $(KERNELS:%=%.nvptx.s) $(HOST:%=%.ll) $(HOST:%=%.bin)
-
-$(KERNELS:%=%.ll):%.ll:%.cl
-	$(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
-
-$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll
-	$(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
-
-$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc
-	$(LLVM_CC) -O3 -target nvptx $< -S -o $@
-
-$(HOST:%=%.ll):%.ll:%.c
-	$(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@
-
-$(HOST:%=%.bin):%.bin:%.c
-	$(LLVM_CC) -O3 -lOpenCL -I /usr/local/cuda/include $< -o $@
-
-clean :
-	rm -f $(HOST).ll $(KERNELS).ll *.bc *.s *.bin
diff --git a/hpvm/test/gemm_opencl/matrixMul/gemm_opencl.c b/hpvm/test/gemm_opencl/matrixMul/gemm_opencl.c
deleted file mode 100644
index 92ee91ed7d718aa677cd598fc37586c037cd253c..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul/gemm_opencl.c
+++ /dev/null
@@ -1,323 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-#include <CL/cl.h>
-
-#define WA 1024
-#define HA 1024
-#define WB 1024
-#define HB WA
-#define WC WB
-#define HC HA
-
-
-
-// Thread block size
-#define BLOCK_SIZE 16
-
-static inline void checkErr(cl_int err, cl_int success, const char * name) {
-  if (err != success) {
-    fprintf(stderr, "ERROR: %s\nErrorcode: %d\n", name, err);
-    exit(EXIT_FAILURE);
-  }
-}
-
-// Allocates a matrix with random float entries.
-void randomInit(float* data, int size) {
-  for (int i = 0; i < size; ++i)
-    data[i] = rand() / (float)RAND_MAX;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Loads a Program file.
-//!
-//! @return the source string if succeeded, 0 otherwise
-//! @param cFilename        program filename
-//! @param szFinalLength    returned length of the code string
-//////////////////////////////////////////////////////////////////////////////
-char* LoadProgSource(const char* cFilename, size_t* szFinalLength)
-{
-    // locals
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0)
-        {
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0)
-        {
-            return NULL;
-        }
-    #endif
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END);
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET);
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + 1);
-    if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength;
-    }
-    cSourceString[szSourceLength] = '\0';
-
-    return cSourceString;
-}
-
-// Check bool
-int isEqual(float a, float b) {
-  return (fabs(a-b) < 0.001);
-}
-
-// Check Results
-
-int checkResults(float* A, float* B, float* C) {
-  unsigned int size_A = WA * HA;
-  unsigned int size_B = WB * HB;
-  unsigned int size_C = WC * HC;
-  unsigned int bytesC = sizeof(float) * size_C;
-  float* goldC = (float*) malloc(bytesC);
-  for (int i=0; i < HC; i++) {
-    for (int j=0; j < WC; j++) {
-      goldC[i*WC + j] = 0;
-      for (int k=0; k < HB; k++) {
-        goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j];
-      }
-      if(!isEqual(goldC[i*WC + j], C[i*WC + j])) {
-        printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]);
-        return 0;
-      }
-    }
-  }
-  return 1; // Success
-}
-
-// GPU Computation of MatrixMul
-void computeMatrixMul(float* h_A, unsigned bytes_A, float* h_B, unsigned bytes_B, float* h_C, unsigned bytes_C) {
- // OpenCL specific variables
-  cl_context clGPUContext;
-  cl_command_queue clCommandQue;
-  cl_program clProgram;
-  cl_kernel clKernel;
-
-  size_t dataBytes;
-  size_t kernelLength;
-  cl_int errcode;
-
-  // OpenCL device memory for matrices
-  cl_mem d_A;
-  cl_mem d_B;
-  cl_mem d_C;
-
-  /*****************************************/
-  /* Initialize OpenCL */
-  /*****************************************/
-  // query the number of platforms
-  cl_uint numPlatforms;
-  errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
-  checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
-
-  // now get all the platform IDs
-  cl_platform_id platforms[numPlatforms];
-  errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
-
-  for(unsigned i=0; i < numPlatforms; i++) {
-    char buffer[10240];
-    //printf("  -- %d --\n", i);
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL);
-    //printf("  PROFILE = %s\n", buffer);
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL);
-    //printf("  VERSION = %s\n", buffer);
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL);
-    //printf("  NAME = %s\n", buffer);
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL);
-    //printf("  VENDOR = %s\n", buffer);
-    clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
-    //printf("  EXTENSIONS = %s\n", buffer);
-  }
-  // set platform property - just pick the first one
-  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
-                                        (int) platforms[0],
-                                        0};
-  clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
-                                         NULL, NULL, &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to create GPU context");
-
-  // get the list of GPU devices associated with context
-  errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0,
-                              NULL, &dataBytes);
-  cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
-  errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes,
-                              clDevices, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to get context info");
-
-  //Create a command-queue
-  clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to create command queue");
-
-  // Setup device memory
-  d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL,
-                       &errcode);
-  d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-                       bytes_A, h_A, &errcode);
-  d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
-                       bytes_B, h_B, &errcode);
-
-
-   // Load and build OpenCL kernel
-  /*char *clMatrixMul = LoadProgSource("matrixMul.cl",
-                                        "// My comment\n",
-                                        &kernelLength);*/
-  //checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program");
-
-  /*clProgram = clCreateProgramWithSource(clGPUContext, 1,
-                                        (const char **)&clMatrixMul,
-                                        &kernelLength, &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to create program from source");
-*/
-  size_t binaryLength;
-  char *clMatrixMul = LoadProgSource("matrixMul.nvptx.s", &binaryLength);
-  checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program Binary");
-  
-  cl_int binaryStatus;
-  clProgram = clCreateProgramWithBinary(clGPUContext, 1, &clDevices[0],
-                                        &binaryLength,
-                                        (const unsigned char **)&clMatrixMul,
-                                        &binaryStatus, &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
-
-  errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to build program");
-
-  clKernel = clCreateKernel(clProgram, "matrixMul", &errcode);
-  checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
-
-
-  // Launch OpenCL kernel
-  size_t localWorkSize[2], globalWorkSize[2];
- 
-  int wA = WA;
-  int wC = WC;
-  errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
-  errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
-  errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
-  errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA);
-  errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC);
-  checkErr(errcode, CL_SUCCESS, "Failure to set kernel arguments");
- 
-  localWorkSize[0] = BLOCK_SIZE;
-  localWorkSize[1] = BLOCK_SIZE;
-  globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
-  globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
- 
-  errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, 
-                                   globalWorkSize, localWorkSize,
-                                   0, NULL, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
- 
-  // Retrieve result from device
-  errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, 
-                                h_C, 0, NULL, NULL);
-  checkErr(errcode, CL_SUCCESS, "Failure to read buffer");
- 
-  // Print out the result
-/*
-  printf("\n\nMatrix C (Result)\n");
-  for(int i = 0; i < size_C; i++) {
-    printf("%f ", h_C[i]);
-    if(((i + 1) % WC) == 0)
-    printf("\n");
-  }
-  printf("\n");
- */
-  // Deallocate memory
-
-  clReleaseMemObject(d_A);
-  clReleaseMemObject(d_C);
-  clReleaseMemObject(d_B);
-
-  free(clDevices);
-  free(clMatrixMul);
-  clReleaseContext(clGPUContext);
-  clReleaseKernel(clKernel);
-  clReleaseProgram(clProgram);
-  clReleaseCommandQueue(clCommandQue);
-
-}
-
-// Main
-int main(int argc, char** argv) {
-
-  // seed for rand()
-  srand(2006);
-
-  // Allocate host memory for matrices A and B
-  unsigned int size_A = WA * HA;
-  unsigned int bytes_A = sizeof(float) * size_A;
-  float* h_A = (float*) malloc(bytes_A);
-
-  unsigned int size_B = WB * HB;
-  unsigned int bytes_B = sizeof(float) * size_B;
-  float* h_B = (float*) malloc(bytes_B);
-
-   // Initialize host memory
-   randomInit(h_A, size_A);
-   randomInit(h_B, size_B);
-
-/*
-   // Print A and B
-   printf("\n\nMatrix A\n");
-   for(int i = 0; i < size_A; i++)
-   {
-      printf("%f ", h_A[i]);
-      if(((i + 1) % WA) == 0)
-      printf("\n");
-   }
-
-   printf("\n\nMatrix B\n");
-   for(int i = 0; i < size_B; i++)
-   {
-      printf("%f ", h_B[i]);
-      if(((i + 1) % WB) == 0)
-      printf("\n");
-   }
-*/
-
-  // Allocate host memory for the result matrix C
-  unsigned int size_C = WC * HC;
-  unsigned int bytes_C = sizeof(float) * size_C;
-  float* h_C = (float*) malloc(bytes_C);
-
-   // Compute using OpenCL
-  computeMatrixMul(h_A, bytes_A, h_B, bytes_B, h_C, bytes_C);
-
-  if(checkResults(h_A, h_B, h_C))
-    printf("\nPass!\n");
-  else
-    printf("\nFailed!\n");
-  printf("\nDone!\n");
-
-  // Deallocate memory
-  free(h_A);
-  free(h_B);
-  free(h_C);
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul/matrixMul.cl b/hpvm/test/gemm_opencl/matrixMul/matrixMul.cl
deleted file mode 100644
index 7ca1d3e347e30b236bcd935cc06f5cfead1b68f0..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul/matrixMul.cl
+++ /dev/null
@@ -1,17 +0,0 @@
-// OpenCL Kernel for matrix multiply, C = A * B
-__kernel void _Z9mysgemmNTPfS_(__global float* A,
-                        size_t bytesA,
-                        __global float* B,
-                        size_t bytesB
-                        ) {
-
-  int tx = get_global_id(0); //2D Global Thread ID x
-  // Initialize accumulator
-
-  // Perform dot-product of row-column
-  // Write in device memory
-  B[tx] = (A[tx] <= B[tx])? B[tx]:A[tx]+B[tx];
-    /*B[tx] = A[tx];*/
-
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul/sgemm.c b/hpvm/test/gemm_opencl/matrixMul/sgemm.c
deleted file mode 100644
index c1c3a300668b94f904393074bb92874715ac5e25..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul/sgemm.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/* 
- * Main entry of vector addition kernel
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-void matrixMultiply(float *C, float *A, float *B, int k, int n);
-
-/*
-// Host matrix multiply
-void matrixMulHost (int m, k, n, int *A, int *B, int *C) {
-
-  for (int i = 0; i < m; i++)
-  for (int j = 0; j < n; j++)
-  for (int t = 0; t < k; t++)
-    C[i*n + j] = A[i*k + t] + B[t*n + j];
-
-  return;
-}
-*/
-
-// Allocates a matrix with random float entries.
-void randomInit(float* data, int size) {
-  for (int i = 0; i < size; ++i)
-    data[i] = rand() / (float)RAND_MAX;
-}
-
-int main (int argc, char *argv[]) {
-  int m = atoi(argv[1]);
-  int k = atoi(argv[2]);
-  int n = atoi(argv[3]);
-
-  float *A, *B, *C;
-
-  /********************************************************************
-  Allocate memory and initialize the input/output vectors
-  ********************************************************************/
-
-  A = (float *) malloc(m*k*sizeof(float));
-  B = (float *) malloc(k*n*sizeof(float));
-  C = (float *) malloc(m*n*sizeof(float));
-
-  randomInit(A, m*k);
-  randomInit(B, k*n);
-
-  matrixMultiply(C, A, B, k, n);
-    
-  /********************************************************************
-  Free memory allocations
-  ********************************************************************/
-
-  free(A); free(B); free(C);
-
-  return 0;
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul/visc_gemm_opencl.ll b/hpvm/test/gemm_opencl/matrixMul/visc_gemm_opencl.ll
deleted file mode 100644
index fe287e55ac68f3677e6a2eb528657f2b4e791672..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul/visc_gemm_opencl.ll
+++ /dev/null
@@ -1,448 +0,0 @@
-; RUN: opt -load LLVMBuildDFG.so -load LLVMDFG2LLVM_X86.so -load LLVMClearDFG.so -dfg2llvm-x86 -clearDFG -o %t.ll -S < %s
-; RUN: llvm-link %t.ll ~/current-src/projects/visc-rt/visc-rt.ll -S -o %t.linked.ll
-; RUN: clang -O3 %t.linked.ll -lpthread -lOpenCL -o %t.bin
-; RUN: %t.bin
-; ModuleID = 'gemm_opencl.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@.str1 = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
-@str = private unnamed_addr constant [9 x i8] c"\0AFailed!\00"
-@str26 = private unnamed_addr constant [7 x i8] c"\0ADone!\00"
-@str27 = private unnamed_addr constant [7 x i8] c"\0APass!\00"
-
-; Function Attrs: nounwind uwtable
-define void @randomInit(float* nocapture %data, i32 %size) #0 {
-entry:
-  %cmp3 = icmp sgt i32 %size, 0
-  br i1 %cmp3, label %for.body, label %for.end
-
-for.body:                                         ; preds = %entry, %for.body
-  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
-  %call = tail call i32 @rand() #4
-  %conv = sitofp i32 %call to float
-  %div = fmul float %conv, 0x3E00000000000000
-  %arrayidx = getelementptr inbounds float* %data, i64 %indvars.iv
-  store float %div, float* %arrayidx, align 4, !tbaa !0
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %size
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  ret void
-}
-
-; Function Attrs: nounwind
-declare i32 @rand() #1
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #1
-
-; Function Attrs: nounwind readnone uwtable
-define i32 @isEqual(float %a, float %b) #2 {
-entry:
-  %sub = fsub float %a, %b
-  %fabsf = tail call float @fabsf(float %sub) #6
-  %0 = fpext float %fabsf to double
-  %cmp = fcmp olt double %0, 1.000000e-03
-  %conv1 = zext i1 %cmp to i32
-  ret i32 %conv1
-}
-
-; Function Attrs: nounwind uwtable
-define i32 @checkResults(float* nocapture %A, float* nocapture %B, float* nocapture %C) #0 {
-entry:
-  br label %for.cond4.preheader
-
-for.cond4.preheader:                              ; preds = %entry, %for.inc50
-  %indvars.iv92 = phi i64 [ 0, %entry ], [ %indvars.iv.next93, %for.inc50 ]
-  %i.081 = phi i32 [ 0, %entry ], [ %inc51, %for.inc50 ]
-  %0 = shl nsw i64 %indvars.iv92, 10
-  br label %for.body7
-
-for.cond4:                                        ; preds = %for.end
-  %inc48 = add nsw i32 %j.079, 1
-  %1 = trunc i64 %indvars.iv.next89 to i32
-  %cmp5 = icmp slt i32 %1, 1024
-  br i1 %cmp5, label %for.body7, label %for.inc50
-
-for.body7:                                        ; preds = %for.cond4.preheader, %for.cond4
-  %indvars.iv88 = phi i64 [ 0, %for.cond4.preheader ], [ %indvars.iv.next89, %for.cond4 ]
-  %j.079 = phi i32 [ 0, %for.cond4.preheader ], [ %inc48, %for.cond4 ]
-  %2 = add nsw i64 %indvars.iv88, %0
-  br label %for.body12
-
-for.body12:                                       ; preds = %for.body12, %for.body7
-  %indvars.iv = phi i64 [ 0, %for.body7 ], [ %indvars.iv.next, %for.body12 ]
-  %3 = phi float [ 0.000000e+00, %for.body7 ], [ %add26, %for.body12 ]
-  %4 = add nsw i64 %indvars.iv, %0
-  %arrayidx16 = getelementptr inbounds float* %A, i64 %4
-  %5 = load float* %arrayidx16, align 4, !tbaa !0
-  %6 = shl i64 %indvars.iv, 10
-  %7 = add nsw i64 %6, %indvars.iv88
-  %arrayidx20 = getelementptr inbounds float* %B, i64 %7
-  %8 = load float* %arrayidx20, align 4, !tbaa !0
-  %mul21 = fmul float %5, %8
-  %add26 = fadd float %3, %mul21
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.end, label %for.body12
-
-for.end:                                          ; preds = %for.body12
-  %arrayidx34 = getelementptr inbounds float* %C, i64 %2
-  %9 = load float* %arrayidx34, align 4, !tbaa !0
-  %sub.i = fsub float %add26, %9
-  %fabsf.i = tail call float @fabsf(float %sub.i) #6
-  %10 = fpext float %fabsf.i to double
-  %cmp.i = fcmp olt double %10, 1.000000e-03
-  %indvars.iv.next89 = add i64 %indvars.iv88, 1
-  br i1 %cmp.i, label %for.cond4, label %if.then
-
-if.then:                                          ; preds = %for.end
-  %conv40 = fpext float %9 to double
-  %conv45 = fpext float %add26 to double
-  %call46 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str1, i64 0, i64 0), i32 %i.081, i32 %j.079, double %conv40, double %conv45) #4
-  br label %return
-
-for.inc50:                                        ; preds = %for.cond4
-  %indvars.iv.next93 = add i64 %indvars.iv92, 1
-  %inc51 = add nsw i32 %i.081, 1
-  %11 = trunc i64 %indvars.iv.next93 to i32
-  %cmp = icmp slt i32 %11, 1024
-  br i1 %cmp, label %for.cond4.preheader, label %return
-
-return:                                           ; preds = %for.inc50, %if.then
-  %retval.0 = phi i32 [ 0, %if.then ], [ 1, %for.inc50 ]
-  ret i32 %retval.0
-}
-
-; Function Attrs: nounwind
-declare i32 @printf(i8* nocapture, ...) #1
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #4
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #4
-
-; --------------- VISC Intrinsics ---------------
-; Return Type of VISC Compute Matrix Mul
-%rtype = type {float*, i32}
-%struct.arg = type { float*, i32, float*, i32, float*, i32, i32, i32, i32, %rtype }
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode1D(i8*, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getNode() #0
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.getParentNode(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNumDims(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.x(i8*) #0
-
-; Function Attrs: nounwind
-declare i32 @llvm.visc.getNodeInstanceID.y(i8*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.visc.bind.input(i8*, i32, i32)
-
-; Function Attrs: nounwind
-declare void @llvm.visc.bind.output(i8*, i32, i32)
-; ----------------- VISC intrinsics end ------------------
-
-@.strce = private unnamed_addr constant [28 x i8] c"Computing element (%d, %d)\0A\00", align 1
-@stref = private unnamed_addr constant [17 x i8] c"Entered function\00"
-@strrc = private unnamed_addr constant [16 x i8] c"Result computed\00"
-@strrw = private unnamed_addr constant [20 x i8] c"Result written to C\00"
-@stroa = private unnamed_addr constant [17 x i8] c"Output allocated\00"
-
-; Function Attrs: nounwind uwtable
-define %rtype @matrixMul(float* nocapture %A, i32 %bytes_A, float* nocapture %B, i32 %bytes_B, float* %C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
-entry:
-  %puts = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @stref, i64 0, i64 0))
-
-  ; ------------------------- VISC changes ------------------
-  ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x
-  ; Replaced statement -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #3
-  %this_node = call i8* @llvm.visc.getNode()
-  %call1 = call i32 @llvm.visc.getNodeInstanceID.x(i8* %this_node)
-  
-  ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x
-  ; Replaced statement -- %call2 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #3
-  %call2 = call i32 @llvm.visc.getNodeInstanceID.y(i8* %this_node)
-  ; ---------------------- VISC changes End ------------------
-  %call3 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.strce, i64 0, i64 0), i32 %call1, i32 %call2) #3
-  %cmp32 = icmp sgt i32 %k, 0
-  br i1 %cmp32, label %for.body.lr.ph, label %for.end
-
-for.body.lr.ph:                                   ; preds = %entry
-  %mul = mul nsw i32 %call2, %k
-  %0 = sext i32 %mul to i64
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %for.body.lr.ph
-  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
-  %res.034 = phi float [ 0.000000e+00, %for.body.lr.ph ], [ %add9, %for.body ]
-  ;%calln = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([28 x i8]* @.strce, i64 0, i64 0), i64 %indvars.iv, i32 %call2) #6
-  %1 = add nsw i64 %indvars.iv, %0
-  %arrayidx = getelementptr inbounds float* %A, i64 %1
-  %2 = load float* %arrayidx, align 4, !tbaa !0
-  %3 = trunc i64 %indvars.iv to i32
-  %mul4 = mul nsw i32 %3, %n
-  %add5 = add nsw i32 %mul4, %call1
-  %idxprom6 = sext i32 %add5 to i64
-  %arrayidx7 = getelementptr inbounds float* %B, i64 %idxprom6
-  %4 = load float* %arrayidx7, align 4, !tbaa !0
-  %mul8 = fmul float %2, %4
-  %add9 = fadd float %res.034, %mul8
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %k
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body, %entry
-  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add9, %for.body ]
-  %puts29 = tail call i32 @puts(i8* getelementptr inbounds ([16 x i8]* @strrc, i64 0, i64 0))
-  %mul11 = mul nsw i32 %call2, %n
-  %add12 = add nsw i32 %mul11, %call1
-  %idxprom13 = sext i32 %add12 to i64
-  %arrayidx14 = getelementptr inbounds float* %C, i64 %idxprom13
-  store float %res.0.lcssa, float* %arrayidx14, align 4, !tbaa !0
-  %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([20 x i8]* @strrw, i64 0, i64 0))
-  %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([17 x i8]* @stroa, i64 0, i64 0))
-  %.fca.0.insert = insertvalue %rtype undef, float* %C, 0
-  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %bytes_C, 1
-  ret %rtype %.fca.1.insert
-}
-
-define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
-  %kernel = call i8* @llvm.visc.createNode2D(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @matrixMul to i8*), i32 %WB, i32 %HA)
-  ; Bind Inputs
-  call void @llvm.visc.bind.input(i8* %kernel, i32 0, i32 0); h_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 1, i32 1); bytes_A
-  call void @llvm.visc.bind.input(i8* %kernel, i32 2, i32 2); h_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 3, i32 3); bytes_B
-  call void @llvm.visc.bind.input(i8* %kernel, i32 4, i32 4); h_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 5, i32 5); bytes_C
-  call void @llvm.visc.bind.input(i8* %kernel, i32 6, i32 6); WA = HB = k
-  call void @llvm.visc.bind.input(i8* %kernel, i32 7, i32 7); WB = WC = n
-  call void @llvm.visc.bind.input(i8* %kernel, i32 8, i32 8); HA = HC = m
-  ; Bind Outputs
-  call void @llvm.visc.bind.output(i8* %kernel, i32 0, i32 0); d_C
-  call void @llvm.visc.bind.output(i8* %kernel, i32 1, i32 1); bytes_C
-  ret %rtype zeroinitializer
-}
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #1
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
-entry:
-  tail call void @srand(i32 2006) #4
-  %call = tail call noalias i8* @malloc(i64 4194304) #4
-  %0 = bitcast i8* %call to float*
-  %call7 = tail call noalias i8* @malloc(i64 4194304) #4
-  br label %for.body.i
-
-for.body.i:                                       ; preds = %for.body.i, %entry
-  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %for.body.i ], [ 0, %entry ]
-  %call.i = tail call i32 @rand() #4
-  %conv.i = sitofp i32 %call.i to float
-  %div.i = fmul float %conv.i, 0x3E00000000000000
-  %arrayidx.i = getelementptr inbounds float* %0, i64 %indvars.iv.i
-  store float %div.i, float* %arrayidx.i, align 4, !tbaa !0
-  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
-  %lftr.wideiv59 = trunc i64 %indvars.iv.next.i to i32
-  %exitcond60 = icmp eq i32 %lftr.wideiv59, 1048576
-  br i1 %exitcond60, label %for.body.i40.preheader, label %for.body.i
-
-for.body.i40.preheader:                           ; preds = %for.body.i
-  %1 = bitcast i8* %call7 to float*
-  br label %for.body.i40
-
-for.body.i40:                                     ; preds = %for.body.i40.preheader, %for.body.i40
-  %indvars.iv.i32 = phi i64 [ %indvars.iv.next.i37, %for.body.i40 ], [ 0, %for.body.i40.preheader ]
-  %call.i33 = tail call i32 @rand() #4
-  %conv.i34 = sitofp i32 %call.i33 to float
-  %div.i35 = fmul float %conv.i34, 0x3E00000000000000
-  %arrayidx.i36 = getelementptr inbounds float* %1, i64 %indvars.iv.i32
-  store float %div.i35, float* %arrayidx.i36, align 4, !tbaa !0
-  %indvars.iv.next.i37 = add i64 %indvars.iv.i32, 1
-  %lftr.wideiv57 = trunc i64 %indvars.iv.next.i37 to i32
-  %exitcond58 = icmp eq i32 %lftr.wideiv57, 1048576
-  br i1 %exitcond58, label %randomInit.exit41, label %for.body.i40
-
-randomInit.exit41:                                ; preds = %for.body.i40
-  %call12 = tail call noalias i8* @malloc(i64 4194304) #4
-  %2 = bitcast i8* %call12 to float*
-  
-  ; ---------------------------------- Adding VISC Launch Call --------------------------------
-  ; Setting up launch input args
-  %in.addr = alloca %struct.arg
-
-  ; Store arguments
-  %in.addr.h_A = getelementptr %struct.arg* %in.addr, i32 0, i32 0
-  %in.addr.bytes_A = getelementptr %struct.arg* %in.addr, i32 0, i32 1
-  %in.addr.h_B = getelementptr %struct.arg* %in.addr, i32 0, i32 2
-  %in.addr.bytes_B = getelementptr %struct.arg* %in.addr, i32 0, i32 3
-  %in.addr.h_C = getelementptr %struct.arg* %in.addr, i32 0, i32 4
-  %in.addr.bytes_C = getelementptr %struct.arg* %in.addr, i32 0, i32 5
-  %in.addr.WA = getelementptr %struct.arg* %in.addr, i32 0, i32 6
-  %in.addr.WB = getelementptr %struct.arg* %in.addr, i32 0, i32 7
-  %in.addr.HA = getelementptr %struct.arg* %in.addr, i32 0, i32 8
-
-  store float* %0, float** %in.addr.h_A
-  store i32 4194304, i32* %in.addr.bytes_A
-  store float* %1, float** %in.addr.h_B
-  store i32 4194304, i32* %in.addr.bytes_B
-  store float* %2, float** %in.addr.h_C
-  store i32 4194304, i32* %in.addr.bytes_C
-  store i32 1024, i32* %in.addr.WA
-  store i32 1024, i32* %in.addr.WB
-  store i32 1024, i32* %in.addr.HA
-
-  ; Change type to i8* and VISC Launch call
-  %args = bitcast %struct.arg* %in.addr to i8*
-  %graphID = call i8* @llvm.visc.launch(i8* bitcast (%rtype (float*, i32, float*, i32, float*, i32, i32, i32, i32)* @MatrixMulRoot to i8*), i8* %args)
-  ;tail call void @computeMatrixMul(float* %0, i32 4194304, float* %1, i32 4194304, float* %2, i32 4194304)
-
-  ; Wait for result
-  call void @llvm.visc.wait(i8* %graphID)
-
-  ; Get the result
-  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9
-  %out = load %rtype* %out.addr
-  %out.h_C = extractvalue %rtype %out, 0
-  ;%2 = extractvalue %rtype %out, 0
-  %out.bytes_C = extractvalue %rtype %out, 1
-
-  ; -------------------------------- Completed VISC Launch Call --------------------------------
-  
-  br label %for.cond4.preheader.i
-
-for.cond4.preheader.i:                            ; preds = %for.inc50.i, %randomInit.exit41
-  %indvars.iv92.i = phi i64 [ 0, %randomInit.exit41 ], [ %indvars.iv.next93.i, %for.inc50.i ]
-  %i.081.i = phi i32 [ 0, %randomInit.exit41 ], [ %inc51.i, %for.inc50.i ]
-  %3 = shl nsw i64 %indvars.iv92.i, 10
-  br label %for.body7.i
-
-for.cond4.i:                                      ; preds = %for.end.i
-  %inc48.i = add nsw i32 %j.079.i, 1
-  %4 = trunc i64 %indvars.iv.next89.i to i32
-  %cmp5.i = icmp slt i32 %4, 1024
-  br i1 %cmp5.i, label %for.body7.i, label %for.inc50.i
-
-for.body7.i:                                      ; preds = %for.cond4.i, %for.cond4.preheader.i
-  %indvars.iv88.i = phi i64 [ 0, %for.cond4.preheader.i ], [ %indvars.iv.next89.i, %for.cond4.i ]
-  %j.079.i = phi i32 [ 0, %for.cond4.preheader.i ], [ %inc48.i, %for.cond4.i ]
-  br label %for.body12.i
-
-for.body12.i:                                     ; preds = %for.body12.i, %for.body7.i
-  %indvars.iv.i42 = phi i64 [ 0, %for.body7.i ], [ %indvars.iv.next.i43, %for.body12.i ]
-  %5 = phi float [ 0.000000e+00, %for.body7.i ], [ %add26.i, %for.body12.i ]
-  %6 = add nsw i64 %indvars.iv.i42, %3
-  %arrayidx16.i = getelementptr inbounds float* %0, i64 %6
-  %7 = load float* %arrayidx16.i, align 4, !tbaa !0
-  %8 = shl i64 %indvars.iv.i42, 10
-  %9 = add nsw i64 %8, %indvars.iv88.i
-  %arrayidx20.i = getelementptr inbounds float* %1, i64 %9
-  %10 = load float* %arrayidx20.i, align 4, !tbaa !0
-  %mul21.i = fmul float %7, %10
-  %add26.i = fadd float %5, %mul21.i
-  %indvars.iv.next.i43 = add i64 %indvars.iv.i42, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next.i43 to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.end.i, label %for.body12.i
-
-for.end.i:                                        ; preds = %for.body12.i
-  %11 = add nsw i64 %indvars.iv88.i, %3
-  ; Replace use of %2 with %out.h_C
-  ;%arrayidx34.i = getelementptr inbounds float* %2, i64 %11
-  %arrayidx34.i = getelementptr inbounds float* %out.h_C, i64 %11
-  %12 = load float* %arrayidx34.i, align 4, !tbaa !0
-  %sub.i.i = fsub float %add26.i, %12
-  %fabsf.i.i = tail call float @fabsf(float %sub.i.i) #6
-  %13 = fpext float %fabsf.i.i to double
-  %cmp.i.i = fcmp olt double %13, 1.000000e-03
-  %indvars.iv.next89.i = add i64 %indvars.iv88.i, 1
-  br i1 %cmp.i.i, label %for.cond4.i, label %if.else
-
-for.inc50.i:                                      ; preds = %for.cond4.i
-  %indvars.iv.next93.i = add i64 %indvars.iv92.i, 1
-  %inc51.i = add nsw i32 %i.081.i, 1
-  %14 = trunc i64 %indvars.iv.next93.i to i32
-  %cmp.i = icmp slt i32 %14, 1024
-  br i1 %cmp.i, label %for.cond4.preheader.i, label %if.then
-
-if.then:                                          ; preds = %for.inc50.i
-  %puts31 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str27, i64 0, i64 0))
-  br label %if.end
-
-if.else:                                          ; preds = %for.end.i
-  %conv40.i = fpext float %12 to double
-  %conv45.i = fpext float %add26.i to double
-  %call46.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @.str1, i64 0, i64 0), i32 %i.081.i, i32 %j.079.i, double %conv40.i, double %conv45.i) #4
-  %puts = tail call i32 @puts(i8* getelementptr inbounds ([9 x i8]* @str, i64 0, i64 0))
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %puts30 = tail call i32 @puts(i8* getelementptr inbounds ([7 x i8]* @str26, i64 0, i64 0))
-  tail call void @free(i8* %call) #4
-  tail call void @free(i8* %call7) #4
-  tail call void @free(i8* %call12) #4
-  ret i32 0
-}
-
-
-; Function Attrs: nounwind
-declare void @srand(i32) #1
-
-; Function Attrs: noreturn nounwind
-declare void @exit(i32) #5
-
-declare float @fabsf(float)
-
-; Function Attrs: nounwind
-declare i32 @puts(i8* nocapture) #4
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
-attributes #5 = { noreturn nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #6 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #7 = { noreturn nounwind }
-
-!0 = metadata !{metadata !"float", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"long", metadata !1}
-!4 = metadata !{metadata !"int", metadata !1}
-!5 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/hpvm/test/gemm_opencl/matrixMul_bc/Makefile b/hpvm/test/gemm_opencl/matrixMul_bc/Makefile
deleted file mode 100644
index 1984e14c78f3fabd3f2b92a98b81919dbaf2b979..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_bc/Makefile
+++ /dev/null
@@ -1,27 +0,0 @@
-PASSES :=
-
-.PHONY: clean
-
-LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
-LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc
-HOST:=sgemm
-KERNELS:=matrixMul_bc
-LLVM_CC:=$(LLVM_INSTALL)/bin/clang
-LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link
-
-all: $(KERNELS:%=%.ll) $(HOST:%=%.ll)
-
-$(KERNELS:%=%.ll):%.ll:%.cl
-	$(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
-
-$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll
-	$(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
-
-$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc
-	$(LLVM_CC) -target nvptx $< -S -o $@
-
-$(HOST:%=%.ll):%.ll:%.c
-	$(LLVM_CC) -O3 -S -emit-llvm $< -o $@
-
-clean :
-	rm -f *.ll *.bc *.s
diff --git a/hpvm/test/gemm_opencl/matrixMul_bc/gemm_opencl.c b/hpvm/test/gemm_opencl/matrixMul_bc/gemm_opencl.c
deleted file mode 100644
index 31cd7502ea360592ea845c9705de2568f35d6de9..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_bc/gemm_opencl.c
+++ /dev/null
@@ -1,192 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-//#include <oclUtils.h>
-#include <CL/cl.h>
-
-#define WA 1024
-#define HA 1024
-#define WB 1024
-#define HB WA
-#define WC WB
-#define HC HA
-
-// Thread block size
-#define BLOCK_SIZE 16
-
-inline void checkErr(cl_int err, cl_int success, const char * name) {
-  if (err != success) {
-    fprintf(stderr, "ERROR: %s\n", name);
-    exit(EXIT_FAILURE);
-  }
-}
-
-// Allocates a matrix with random float entries.
-void randomInit(float* data, int size) {
-  for (int i = 0; i < size; ++i)
-    data[i] = rand() / (float)RAND_MAX;
-}
- 
-// Main
-int main(int argc, char** argv) {
-
-  // seed for rand()
-  srand(2006);
- 
-  // Allocate host memory for matrices A and B
-  unsigned int size_A = WA * HA;
-  unsigned int bytes_A = sizeof(float) * size_A;
-  float* h_A = (float*) malloc(bytes_A);
-
-  unsigned int size_B = WB * HB;
-  unsigned int bytes_B = sizeof(float) * size_B;
-  float* h_B = (float*) malloc(bytes_B);
- 
-   // Initialize host memory
-   randomInit(h_A, size_A);
-   randomInit(h_B, size_B);
-
-/* 
-   // Print A and B
-   printf("\n\nMatrix A\n");
-   for(int i = 0; i < size_A; i++)
-   {
-      printf("%f ", h_A[i]);
-      if(((i + 1) % WA) == 0)
-      printf("\n");
-   }
- 
-   printf("\n\nMatrix B\n");
-   for(int i = 0; i < size_B; i++)
-   {
-      printf("%f ", h_B[i]);
-      if(((i + 1) % WB) == 0)
-      printf("\n");
-   }
-*/
-
-  // Allocate host memory for the result matrix C
-  unsigned int size_C = WC * HC;
-  unsigned int bytes_C = sizeof(float) * size_C;
-  float* h_C = (float*) malloc(bytes_C);
- 
-   // Initialize OpenCL
-
-   // OpenCL specific variables
-  cl_context clGPUContext;
-  cl_command_queue clCommandQue;
-  cl_program clProgram;
-  cl_kernel clKernel;
- 
-  size_t dataBytes;
-  size_t kernelLength;
-  cl_int errcode;
-
-  // OpenCL device memory for matrices
-  cl_mem d_A;
-  cl_mem d_B;
-  cl_mem d_C;
-
-  /*****************************************/
-  /* Initialize OpenCL */
-  /*****************************************/
-  clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU,
-                                         NULL, NULL, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-
-  // get the list of GPU devices associated with context
-  errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0,
-                              NULL, &dataBytes);
-  cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
-  errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, 
-                              clDevices, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
-
-  //Create a command-queue
-  clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-  
-  // Setup device memory
-  d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL,
-                       &errcode);
-  d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
-                       bytes_A, h_A, &errcode);
-  d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
-                       bytes_B, h_B, &errcode);
- 
- 
-   // Load and build OpenCL kernel
-  char *clMatrixMul = oclLoadProgSource("kernel.cl",
-                                        "// My comment\n", 
-                                        &kernelLength);
-  shrCheckError(clMatrixMul != NULL, CL_SUCCESS);
- 
-  clProgram = clCreateProgramWithSource(clGPUContext, 1,
-                                        (const char **)&clMatrixMul, 
-                                        &kernelLength, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-
-  errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  clKernel = clCreateKernel(clProgram, "matrixMul", &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-
-
-  // Launch OpenCL kernel
-  size_t localWorkSize[2], globalWorkSize[2];
- 
-  int wA = WA;
-  int wC = WC;
-  errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
-  errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
-  errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
-  errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA);
-  errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  localWorkSize[0] = BLOCK_SIZE;
-  localWorkSize[1] = BLOCK_SIZE;
-  globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
-  globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
- 
-  errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, 
-                                   globalWorkSize, localWorkSize,
-                                   0, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  // Retrieve result from device
-  errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, 
-                                h_C, 0, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  // Print out the result
-/*
-  printf("\n\nMatrix C (Result)\n");
-  for(int i = 0; i < size_C; i++) {
-    printf("%f ", h_C[i]);
-    if(((i + 1) % WC) == 0)
-    printf("\n");
-  }
-  printf("\n");
- */
-  printf("\nDone!\n");
-
-  // Deallocate memory
-  free(h_A);
-  free(h_B);
-  free(h_C);
-
-  clReleaseMemObject(d_A);
-  clReleaseMemObject(d_C);
-  clReleaseMemObject(d_B);
-
-  free(clDevices);
-  free(clMatrixMul);
-  clReleaseContext(clGPUContext);
-  clReleaseKernel(clKernel);
-  clReleaseProgram(clProgram);
-  clReleaseCommandQueue(clCommandQue);
-
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul_bc/matrixMul_bc.cl b/hpvm/test/gemm_opencl/matrixMul_bc/matrixMul_bc.cl
deleted file mode 100644
index 64c52ff5b8d5afd7c35a73cfca3a6587565810cd..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_bc/matrixMul_bc.cl
+++ /dev/null
@@ -1,30 +0,0 @@
-// Thread block size
-#define TILE_SIZE 16
-
-// OpenCL Kernel for matrix multiply, C = A * B
-// with boundary conditions
-__kernel void matrixMul(__global float* C,
-                        __global float* A,
-                        __global float* B,
-                        int m,
-                        int k,
-                        int n) {
-
-  int tx = get_global_id(0); //2D Global Thread ID x
-  int ty = get_global_id(1); //2D Global Thread ID y
-
-  if ((tx < n) && (ty < m)) {
-    // Initialize accumulator
-    float res = 0.0f;
-
-    // Perform dot-product of row-column
-    for (int i = 0; i < k; i++) {
-      res += A[ty*k+i] * B[i*n+tx];
-    }
-
-    // Write in device memory
-    C[ty*n+tx] = res;
-  }
-
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul_bc/sgemm.c b/hpvm/test/gemm_opencl/matrixMul_bc/sgemm.c
deleted file mode 100644
index 1cf29a212c7eddab4f0e7c82da832f1a3f589e6a..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_bc/sgemm.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/* 
- * Main entry of vector addition kernel
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-void matrixMultiply(float *C, float *A, float *B, int m, int k, int n);
-
-/*
-// Host matrix multiply
-void matrixMulHost (int m, k, n, int *A, int *B, int *C) {
-
-  for (int i = 0; i < m; i++)
-  for (int j = 0; j < n; j++)
-  for (int t = 0; t < k; t++)
-    C[i*n + j] = A[i*k + t] + B[t*n + j];
-
-  return;
-}
-*/
-
-// Allocates a matrix with random float entries.
-void randomInit(float* data, int size) {
-  for (int i = 0; i < size; ++i)
-    data[i] = rand() / (float)RAND_MAX;
-}
-
-int main (int argc, char *argv[]) {
-  int m = atoi(argv[1]);
-  int k = atoi(argv[2]);
-  int n = atoi(argv[3]);
-
-  float *A, *B, *C;
-
-  /********************************************************************
-  Allocate memory and initialize the input/output vectors
-  ********************************************************************/
-
-  A = (float *) malloc(m*k*sizeof(float));
-  B = (float *) malloc(k*n*sizeof(float));
-  C = (float *) malloc(m*n*sizeof(float));
-
-  randomInit(A, m*k);
-  randomInit(B, k*n);
-
-  matrixMultiply(C, A, B, m, k, n);
-    
-  /********************************************************************
-  Free memory allocations
-  ********************************************************************/
-
-  free(A); free(B); free(C);
-
-  return 0;
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul_sh/Makefile b/hpvm/test/gemm_opencl/matrixMul_sh/Makefile
deleted file mode 100644
index 109208976556eaa52a56eb733cee3cfd13e245eb..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_sh/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-PASSES := 
-
-.PHONY: clean
-
-LIBCLC:=/home/kotsifa2/llvm/libclc
-HOST:=sgemm
-KERNELS:=matrixMul_sh
-
-all: $(KERNELS:%=%.ll) $(HOST:%=%.ll)
-
-$(KERNELS:%=%.ll):%.ll:%.cl
-	clang -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
-
-$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll
-	llvm-link $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
-
-$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc
-	clang -target nvptx $< -S -o $@
-
-$(HOST:%=%.ll):%.ll:%.c
-	clang -O3 -S -emit-llvm $< -o $@
-
-clean :
-	rm -f *.ll *.bc *.s
diff --git a/hpvm/test/gemm_opencl/matrixMul_sh/gemm_opencl.c b/hpvm/test/gemm_opencl/matrixMul_sh/gemm_opencl.c
deleted file mode 100644
index 2c41a20814f82473802e1efc54e82dbad59ee02d..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_sh/gemm_opencl.c
+++ /dev/null
@@ -1,193 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <oclUtils.h>
-
-#define WA 1024
-#define HA 1024
-#define WB 1024
-#define HB WA
-#define WC WB
-#define HC HA
-
-// Thread block size
-#define BLOCK_SIZE 16
-
-inline void checkErr(cl_int err, cl_int success, const char * name) {
-  if (err != success) {
-    fprintf(stderr, "ERROR: %s\n", name);
-    exit(EXIT_FAILURE);
-  }
-}
-
-// Allocates a matrix with random float entries.
-void randomInit(float* data, int size) {
-  for (int i = 0; i < size; ++i)
-    data[i] = rand() / (float)RAND_MAX;
-}
- 
-// Main
-int main(int argc, char** argv) {
-
-  // seed for rand()
-  srand(2006);
- 
-  // Allocate host memory for matrices A and B
-  unsigned int size_A = WA * HA;
-  unsigned int bytes_A = sizeof(float) * size_A;
-  float* h_A = (float*) malloc(bytes_A);
-
-  unsigned int size_B = WB * HB;
-  unsigned int bytes_B = sizeof(float) * size_B;
-  float* h_B = (float*) malloc(bytes_B);
- 
-   // Initialize host memory
-   randomInit(h_A, size_A);
-   randomInit(h_B, size_B);
-
-/* 
-   // Print A and B
-   printf("\n\nMatrix A\n");
-   for(int i = 0; i < size_A; i++)
-   {
-      printf("%f ", h_A[i]);
-      if(((i + 1) % WA) == 0)
-      printf("\n");
-   }
- 
-   printf("\n\nMatrix B\n");
-   for(int i = 0; i < size_B; i++)
-   {
-      printf("%f ", h_B[i]);
-      if(((i + 1) % WB) == 0)
-      printf("\n");
-   }
-*/
-
-  // Allocate host memory for the result matrix C
-  unsigned int size_C = WC * HC;
-  unsigned int bytes_C = sizeof(float) * size_C;
-  float* h_C = (float*) malloc(bytes_C);
- 
-   // Initialize OpenCL
-
-   // OpenCL specific variables
-  cl_context clGPUContext;
-  cl_command_queue clCommandQue;
-  cl_program clProgram;
-  cl_kernel clKernel;
- 
-  size_t dataBytes;
-  size_t kernelLength;
-  cl_int errcode;
-
-  // OpenCL device memory for matrices
-  cl_mem d_A;
-  cl_mem d_B;
-  cl_mem d_C;
-
-  /*****************************************/
-  /* Initialize OpenCL */
-  /*****************************************/
-  clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU,
-                                         NULL, NULL, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-
-  // get the list of GPU devices associated with context
-  errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0,
-                              NULL, &dataBytes);
-  cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
-  errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, 
-                              clDevices, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
-
-  //Create a command-queue
-  clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-  
-  // Setup device memory
-  d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL,
-                       &errcode);
-  d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
-                       bytes_A, h_A, &errcode);
-  d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
-                       bytes_B, h_B, &errcode);
- 
- 
-   // Load and build OpenCL kernel
-  char *clMatrixMul = oclLoadProgSource("kernel.cl",
-                                        "// My comment\n", 
-                                        &kernelLength);
-  shrCheckError(clMatrixMul != NULL, shrTRUE);
- 
-  clProgram = clCreateProgramWithSource(clGPUContext, 1,
-                                        (const char **)&clMatrixMul, 
-                                        &kernelLength, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-
-  errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  clKernel = clCreateKernel(clProgram, "matrixMul", &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-
-
-  // Launch OpenCL kernel
-  size_t localWorkSize[2], globalWorkSize[2];
- 
-  int hA = HA;
-  int wA = WA;
-  int wC = WC;
-  errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
-  errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
-  errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
-  errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&hA);
-  errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wA);
-  errcode |= clSetKernelArg(clKernel, 5, sizeof(int), (void *)&wC);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  localWorkSize[0] = BLOCK_SIZE;
-  localWorkSize[1] = BLOCK_SIZE;
-  globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
-  globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
- 
-  errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, 
-                                   globalWorkSize, localWorkSize,
-                                   0, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  // Retrieve result from device
-  errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, 
-                                h_C, 0, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  // Print out the result
-/*
-  printf("\n\nMatrix C (Result)\n");
-  for(int i = 0; i < size_C; i++) {
-    printf("%f ", h_C[i]);
-    if(((i + 1) % WC) == 0)
-    printf("\n");
-  }
-  printf("\n");
- */
-  printf("\nDone!\n");
-
-  // Deallocate memory
-  free(h_A);
-  free(h_B);
-  free(h_C);
-
-  clReleaseMemObject(d_A);
-  clReleaseMemObject(d_C);
-  clReleaseMemObject(d_B);
-
-  free(clDevices);
-  free(clMatrixMul);
-  clReleaseContext(clGPUContext);
-  clReleaseKernel(clKernel);
-  clReleaseProgram(clProgram);
-  clReleaseCommandQueue(clCommandQue);
-
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul_sh/matrixMul_sh.cl b/hpvm/test/gemm_opencl/matrixMul_sh/matrixMul_sh.cl
deleted file mode 100644
index db89aba5a37b87af73d33560a120b125f9f1a921..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_sh/matrixMul_sh.cl
+++ /dev/null
@@ -1,54 +0,0 @@
-// Tile size
-#define TILE_SIZE 16
-
-// OpenCL Kernel for matrix multiply, C = A * B
-// using shared memory
-__kernel void matrixMul_sh(__global float* C,
-                           __global float* A,
-                           __global float* B,
-                           int k,
-                           int n) {
-
-  int tx = get_local_id(0); //2D Local Thread ID x
-  int ty = get_local_id(1); //2D Local Thread ID y
-
-  int bx = get_group_id(0); //2D Block ID x
-  int by = get_group_id(1); //2D Block ID y
-
-  int col = bx * TILE_SIZE + tx;
-  int row = by * TILE_SIZE + ty;
-
-  // Static work-group (thread block) local (shared) memory allocations
-  __local float A_s[TILE_SIZE][TILE_SIZE];
-  __local float B_s[TILE_SIZE][TILE_SIZE];
-
-  // Initialize accumulator
-  float res = 0.0f;
-
-  int i,l;
-
-  for (l = 0; l < k/TILE_SIZE; l++) {
-    // Transfer tiles of A and B to local (shared) memory
-    A_s[ty][tx] = A[row*k + TILE_SIZE*l + tx];
-    B_s[ty][tx] = B[(TILE_SIZE*l + ty)*n + col];
-
-    // Barrier to synchronize all threads
-    barrier(CLK_LOCAL_MEM_FENCE);
-    // Now the local submatricies A_s and B_s are valid
-
-    /* Multiply the two submatrices. Each thread computes one element of the *
-     * block submatrix                                                       */
-    for (i = 0; i < TILE_SIZE; i++)
-      res += A_s[ty][i]*B_s[i][tx];
-
-    /* Barrier: calculations must be completed before next memory transfer    *
-     * can start                                                              */
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-  }
-
-  // Write in device memory
-  C[row * n + col] = res;
-
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul_sh/sgemm.c b/hpvm/test/gemm_opencl/matrixMul_sh/sgemm.c
deleted file mode 100644
index c1c3a300668b94f904393074bb92874715ac5e25..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_sh/sgemm.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/* 
- * Main entry of vector addition kernel
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-void matrixMultiply(float *C, float *A, float *B, int k, int n);
-
-/*
-// Host matrix multiply
-void matrixMulHost (int m, k, n, int *A, int *B, int *C) {
-
-  for (int i = 0; i < m; i++)
-  for (int j = 0; j < n; j++)
-  for (int t = 0; t < k; t++)
-    C[i*n + j] = A[i*k + t] + B[t*n + j];
-
-  return;
-}
-*/
-
-// Allocates a matrix with random float entries.
-void randomInit(float* data, int size) {
-  for (int i = 0; i < size; ++i)
-    data[i] = rand() / (float)RAND_MAX;
-}
-
-int main (int argc, char *argv[]) {
-  int m = atoi(argv[1]);
-  int k = atoi(argv[2]);
-  int n = atoi(argv[3]);
-
-  float *A, *B, *C;
-
-  /********************************************************************
-  Allocate memory and initialize the input/output vectors
-  ********************************************************************/
-
-  A = (float *) malloc(m*k*sizeof(float));
-  B = (float *) malloc(k*n*sizeof(float));
-  C = (float *) malloc(m*n*sizeof(float));
-
-  randomInit(A, m*k);
-  randomInit(B, k*n);
-
-  matrixMultiply(C, A, B, k, n);
-    
-  /********************************************************************
-  Free memory allocations
-  ********************************************************************/
-
-  free(A); free(B); free(C);
-
-  return 0;
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul_sh_bc/Makefile b/hpvm/test/gemm_opencl/matrixMul_sh_bc/Makefile
deleted file mode 100644
index 89793871e5966a0cd45ea25fe9a18d4941b8ef65..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_sh_bc/Makefile
+++ /dev/null
@@ -1,24 +0,0 @@
-PASSES := 
-
-.PHONY: clean
-
-LIBCLC:=/home/kotsifa2/llvm/libclc
-HOST:=sgemm
-KERNELS:=matrixMul_sh_bc
-
-all: $(KERNELS:%=%.ll) $(HOST:%=%.ll)
-
-$(KERNELS:%=%.ll):%.ll:%.cl
-	clang -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
-
-$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll
-	llvm-link $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
-
-$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc
-	clang -target nvptx $< -S -o $@
-
-$(HOST:%=%.ll):%.ll:%.c
-	clang -O3 -S -emit-llvm $< -o $@
-
-clean :
-	rm -f *.ll *.bc *.s
diff --git a/hpvm/test/gemm_opencl/matrixMul_sh_bc/gemm_opencl.c b/hpvm/test/gemm_opencl/matrixMul_sh_bc/gemm_opencl.c
deleted file mode 100644
index 2c41a20814f82473802e1efc54e82dbad59ee02d..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_sh_bc/gemm_opencl.c
+++ /dev/null
@@ -1,193 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <oclUtils.h>
-
-#define WA 1024
-#define HA 1024
-#define WB 1024
-#define HB WA
-#define WC WB
-#define HC HA
-
-// Thread block size
-#define BLOCK_SIZE 16
-
-inline void checkErr(cl_int err, cl_int success, const char * name) {
-  if (err != success) {
-    fprintf(stderr, "ERROR: %s\n", name);
-    exit(EXIT_FAILURE);
-  }
-}
-
-// Allocates a matrix with random float entries.
-void randomInit(float* data, int size) {
-  for (int i = 0; i < size; ++i)
-    data[i] = rand() / (float)RAND_MAX;
-}
- 
-// Main
-int main(int argc, char** argv) {
-
-  // seed for rand()
-  srand(2006);
- 
-  // Allocate host memory for matrices A and B
-  unsigned int size_A = WA * HA;
-  unsigned int bytes_A = sizeof(float) * size_A;
-  float* h_A = (float*) malloc(bytes_A);
-
-  unsigned int size_B = WB * HB;
-  unsigned int bytes_B = sizeof(float) * size_B;
-  float* h_B = (float*) malloc(bytes_B);
- 
-   // Initialize host memory
-   randomInit(h_A, size_A);
-   randomInit(h_B, size_B);
-
-/* 
-   // Print A and B
-   printf("\n\nMatrix A\n");
-   for(int i = 0; i < size_A; i++)
-   {
-      printf("%f ", h_A[i]);
-      if(((i + 1) % WA) == 0)
-      printf("\n");
-   }
- 
-   printf("\n\nMatrix B\n");
-   for(int i = 0; i < size_B; i++)
-   {
-      printf("%f ", h_B[i]);
-      if(((i + 1) % WB) == 0)
-      printf("\n");
-   }
-*/
-
-  // Allocate host memory for the result matrix C
-  unsigned int size_C = WC * HC;
-  unsigned int bytes_C = sizeof(float) * size_C;
-  float* h_C = (float*) malloc(bytes_C);
- 
-   // Initialize OpenCL
-
-   // OpenCL specific variables
-  cl_context clGPUContext;
-  cl_command_queue clCommandQue;
-  cl_program clProgram;
-  cl_kernel clKernel;
- 
-  size_t dataBytes;
-  size_t kernelLength;
-  cl_int errcode;
-
-  // OpenCL device memory for matrices
-  cl_mem d_A;
-  cl_mem d_B;
-  cl_mem d_C;
-
-  /*****************************************/
-  /* Initialize OpenCL */
-  /*****************************************/
-  clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU,
-                                         NULL, NULL, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-
-  // get the list of GPU devices associated with context
-  errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0,
-                              NULL, &dataBytes);
-  cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
-  errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, 
-                              clDevices, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
-
-  //Create a command-queue
-  clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-  
-  // Setup device memory
-  d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL,
-                       &errcode);
-  d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
-                       bytes_A, h_A, &errcode);
-  d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
-                       bytes_B, h_B, &errcode);
- 
- 
-   // Load and build OpenCL kernel
-  char *clMatrixMul = oclLoadProgSource("kernel.cl",
-                                        "// My comment\n", 
-                                        &kernelLength);
-  shrCheckError(clMatrixMul != NULL, shrTRUE);
- 
-  clProgram = clCreateProgramWithSource(clGPUContext, 1,
-                                        (const char **)&clMatrixMul, 
-                                        &kernelLength, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-
-  errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  clKernel = clCreateKernel(clProgram, "matrixMul", &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-
-
-  // Launch OpenCL kernel
-  size_t localWorkSize[2], globalWorkSize[2];
- 
-  int hA = HA;
-  int wA = WA;
-  int wC = WC;
-  errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C);
-  errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
-  errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
-  errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&hA);
-  errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wA);
-  errcode |= clSetKernelArg(clKernel, 5, sizeof(int), (void *)&wC);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  localWorkSize[0] = BLOCK_SIZE;
-  localWorkSize[1] = BLOCK_SIZE;
-  globalWorkSize[0] = ((WB-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
-  globalWorkSize[1] = ((HA-1)/BLOCK_SIZE + 1) * BLOCK_SIZE;
- 
-  errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, 
-                                   globalWorkSize, localWorkSize,
-                                   0, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  // Retrieve result from device
-  errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, 
-                                h_C, 0, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
- 
-  // Print out the result
-/*
-  printf("\n\nMatrix C (Result)\n");
-  for(int i = 0; i < size_C; i++) {
-    printf("%f ", h_C[i]);
-    if(((i + 1) % WC) == 0)
-    printf("\n");
-  }
-  printf("\n");
- */
-  printf("\nDone!\n");
-
-  // Deallocate memory
-  free(h_A);
-  free(h_B);
-  free(h_C);
-
-  clReleaseMemObject(d_A);
-  clReleaseMemObject(d_C);
-  clReleaseMemObject(d_B);
-
-  free(clDevices);
-  free(clMatrixMul);
-  clReleaseContext(clGPUContext);
-  clReleaseKernel(clKernel);
-  clReleaseProgram(clProgram);
-  clReleaseCommandQueue(clCommandQue);
-
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul_sh_bc/matrixMul_sh_bc.cl b/hpvm/test/gemm_opencl/matrixMul_sh_bc/matrixMul_sh_bc.cl
deleted file mode 100644
index ffa734eaa37eac8e85c4a884ce9d9f848b1ba970..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_sh_bc/matrixMul_sh_bc.cl
+++ /dev/null
@@ -1,92 +0,0 @@
-// Tile size
-#define TILE_SIZE 16
-
-// OpenCL Kernel for matrix multiply, C = A * B
-// using shared memory and with boundary conditions
-__kernel void matrixMul(__global float* C,
-                        __global float* A,
-                        __global float* B,
-                        int m,
-                        int k,
-                        int n) {
-
-  int tx = get_local_id(0); //2D Local Thread ID x
-  int ty = get_local_id(1); //2D Local Thread ID y
-
-  int bx = get_group_id(0); //2D Block ID x
-  int by = get_group_id(1); //2D Block ID y
-
-  int col = bx * TILE_SIZE + tx;
-  int row = by * TILE_SIZE + ty;
-
-  // Static work-group (thread block) local (shared) memory allocations
-  __local float A_s[TILE_SIZE][TILE_SIZE];
-  __local float B_s[TILE_SIZE][TILE_SIZE];
-
-  // Loop counters
-  int i, l;
-
-  // Initialize accumulator
-  float res = 0.0f;
-
-  /* In the tiled version of matrix multiplication for arbitrary sizes,       * 
-   * threads that are not matched with an element of the output matrix may    *
-   * still participate in the memory transfer                                 */
-
-  for (l = 0; l < k/TILE_SIZE; l++) {
-    /* Loop for calculating with interior tiles (as far as k is concerned) *
-     * Threads may still exceed the bottom limit of matrix A and the right *
-     * limit of matrix B                                                   */
-
-    /* check: thread does not exceed bottom limit of A */
-    if (row < m)
-        A_s[ty][tx] = A[row*k + TILE_SIZE*l + tx];
-
-    /* check: thread does not exceed right limit of B */
-    if (col < n)
-        B_s[ty][tx] = B[(TILE_SIZE*l + ty)*n + col];
-
-    /* Barrier: memory transfer must be completed before calculations start */
-    barrier(CLK_LOCAL_MEM_FENCE);
-
-    /* The threads that are within limits of output matrix accumulate in      *
-     * their local result                                                     */
-    if ((row < m) && (col < n))
-      for (i = 0; i < TILE_SIZE; i++)
-        res += A_s[ty][i]*B_s[i][tx];
-
-    /* Barrier: calculations must be over before next memory transfer starts */
-    barrier(CLK_LOCAL_MEM_FENCE);
-  }
-
-  /* For this last iteration, which will occur if matrices' sizes are not     *
-   * proportional to the tile size, we must take care not to exceed the right *
-   * limit of A and the bottom limit of B as well                             */
-
-  /* 1st check: thread does not exceed right limit of A  *
-   * 2st check: thread does not exceed bottom limit of A */
-  if ((TILE_SIZE*l + tx < k) && (row < m))
-    A_s[ty][tx] = A[row*k + TILE_SIZE*l + tx];
-
-  /* 1st check: thread does not exceed bottom limit of B *
-   * 2st check: thread does not exceed right limit of B  */
-  if ((TILE_SIZE*l + ty < k) && (col < n))
-    B_s[ty][tx] = B[(TILE_SIZE*l + ty)*n + col];
-
-  /* Barrier: memory transfer must be completed before calculations can start */
-  barrier(CLK_LOCAL_MEM_FENCE);
-
-  /* The threads that are within limits of output matrix accumulate in their *
-   * local result                                                            */
-  if ((row < m) && (col < n)) {
-    for (i = 0; i < min(TILE_SIZE, k - TILE_SIZE*l); i++)
-      res += A_s[ty][i]*B_s[i][tx];
-      /* Synchronization is not necessary, because each thread within limits *
-       * of output matrix copies back its own private result                 */
-      C[row*n + col] = res;
-  }
-
-  return;
-
-}
-
diff --git a/hpvm/test/gemm_opencl/matrixMul_sh_bc/sgemm.c b/hpvm/test/gemm_opencl/matrixMul_sh_bc/sgemm.c
deleted file mode 100644
index 1cf29a212c7eddab4f0e7c82da832f1a3f589e6a..0000000000000000000000000000000000000000
--- a/hpvm/test/gemm_opencl/matrixMul_sh_bc/sgemm.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/* 
- * Main entry of vector addition kernel
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-void matrixMultiply(float *C, float *A, float *B, int m, int k, int n);
-
-/*
-// Host matrix multiply
-void matrixMulHost (int m, k, n, int *A, int *B, int *C) {
-
-  for (int i = 0; i < m; i++)
-  for (int j = 0; j < n; j++)
-  for (int t = 0; t < k; t++)
-    C[i*n + j] = A[i*k + t] + B[t*n + j];
-
-  return;
-}
-*/
-
-// Allocates a matrix with random float entries.
-void randomInit(float* data, int size) {
-  for (int i = 0; i < size; ++i)
-    data[i] = rand() / (float)RAND_MAX;
-}
-
-int main (int argc, char *argv[]) {
-  int m = atoi(argv[1]);
-  int k = atoi(argv[2]);
-  int n = atoi(argv[3]);
-
-  float *A, *B, *C;
-
-  /********************************************************************
-  Allocate memory and initialize the input/output vectors
-  ********************************************************************/
-
-  A = (float *) malloc(m*k*sizeof(float));
-  B = (float *) malloc(k*n*sizeof(float));
-  C = (float *) malloc(m*n*sizeof(float));
-
-  randomInit(A, m*k);
-  randomInit(B, k*n);
-
-  matrixMultiply(C, A, B, m, k, n);
-    
-  /********************************************************************
-  Free memory allocations
-  ********************************************************************/
-
-  free(A); free(B); free(C);
-
-  return 0;
-}
-
diff --git a/hpvm/test/singleVecNode/Makefile b/hpvm/test/singleVecNode/Makefile
deleted file mode 100644
index 5a13636aca764f63fbef04f57e73e7a41bfb394a..0000000000000000000000000000000000000000
--- a/hpvm/test/singleVecNode/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-TARGET := singleVecNode
-PASSES :=
-OPTS := -mllvm -vectorize-slp-aggressive
-SOURCEFILES := $(TARGET).ll $(TARGET).opt.ll
-
-.PHONY: clean
-
-all: $(SOURCEFILES)
-
-$(TARGET).opt.ll: $(TARGET).opt.bc
-	llvm-dis $< > $@
-
-$(TARGET).opt.bc: $(TARGET).bc
-	opt $(PASSES) $< > $@
-
-$(TARGET).bc: $(TARGET).ll
-	llvm-as $< > $@
-
-$(TARGET).ll: $(TARGET).c
-	clang -O3 -emit-llvm -S $(OPTS) $< -o $@
-
-clean :
-	rm -f $(SOURCEFILES) *.bc
diff --git a/hpvm/test/singleVecNode/singleVecNode.c b/hpvm/test/singleVecNode/singleVecNode.c
deleted file mode 100644
index 88b579b8caa1c929e553ea7c1c13391eb8c2f574..0000000000000000000000000000000000000000
--- a/hpvm/test/singleVecNode/singleVecNode.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Main entry of vector addition kernel
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-#define N 16
-
-int main (int argc, char *argv[]) {
-  float n = atof(argv[1]);
-
-  float a[N];
-  int i;
-  for(i = 0; i<N; i++) {
-    a[i] = n;
-  }
-  for(i=0; i<N;i++) {
-    a[i] = a[i] + i;
-  }
-  for(i=0; i<N;i++) {
-    printf("a[%d] = %f\n", i, a[i]);
-  }
-
-  return 0;
-}
-
diff --git a/hpvm/test/singleVecNode/visc_singleVecNode.ll b/hpvm/test/singleVecNode/visc_singleVecNode.ll
deleted file mode 100644
index 58d0a9a3c79a9ea5990ab2db2ebedab55a4bf929..0000000000000000000000000000000000000000
--- a/hpvm/test/singleVecNode/visc_singleVecNode.ll
+++ /dev/null
@@ -1,90 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@.str = private unnamed_addr constant [12 x i8] c"a[%d] = %f\0A\00", align 1
-
-declare i8* @llvm.visc.createNode(i8*)
-
-@llvm.visc.root = global i32 ()* @Root
-
-; Function for llvm.visc.root
-
-define i32 @Root() {
-  %init_node = call i8* @llvm.visc.createNode(i8* bitcast (i32 (i32, i8**)* @main to i8*))
-  ret i32 0
-}
-
-; Function Attrs: nounwind uwtable 
-define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
-middle.block:
-  %a = alloca [16 x float], align 16
-  %arrayidx = getelementptr inbounds i8** %argv, i64 1
-  %0 = load i8** %arrayidx, align 8, !tbaa !0
-  %call.i = call double @strtod(i8* nocapture %0, i8** null) #1
-  %1 = bitcast [16 x float]* %a to i8*
-  call void @llvm.lifetime.start(i64 64, i8* %1) #1
-  %conv = fptrunc double %call.i to float
-  %broadcast.splatinsert63 = insertelement <4 x float> undef, float %conv, i32 0
-  %broadcast.splat64 = shufflevector <4 x float> %broadcast.splatinsert63, <4 x float> undef, <4 x i32> zeroinitializer
-  %2 = bitcast [16 x float]* %a to <4 x float>*
-  store <4 x float> %broadcast.splat64, <4 x float>* %2, align 16
-  %3 = getelementptr inbounds [16 x float]* %a, i64 0, i64 4
-  %4 = bitcast float* %3 to <4 x float>*
-  store <4 x float> %broadcast.splat64, <4 x float>* %4, align 16
-  %5 = getelementptr inbounds [16 x float]* %a, i64 0, i64 8
-  %6 = bitcast float* %5 to <4 x float>*
-  store <4 x float> %broadcast.splat64, <4 x float>* %6, align 16
-  %7 = getelementptr inbounds [16 x float]* %a, i64 0, i64 12
-  %8 = bitcast float* %7 to <4 x float>*
-  store <4 x float> %broadcast.splat64, <4 x float>* %8, align 16
-  %wide.load = load <4 x float>* %2, align 16
-  %9 = fadd <4 x float> %wide.load, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-  store <4 x float> %9, <4 x float>* %2, align 16
-  %wide.load.1 = load <4 x float>* %4, align 16
-  %10 = fadd <4 x float> %wide.load.1, <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>
-  store <4 x float> %10, <4 x float>* %4, align 16
-  %wide.load.2 = load <4 x float>* %6, align 16
-  %11 = fadd <4 x float> %wide.load.2, <float 8.000000e+00, float 9.000000e+00, float 1.000000e+01, float 1.100000e+01>
-  store <4 x float> %11, <4 x float>* %6, align 16
-  %wide.load.3 = load <4 x float>* %8, align 16
-  %12 = fadd <4 x float> %wide.load.3, <float 1.200000e+01, float 1.300000e+01, float 1.400000e+01, float 1.500000e+01>
-  store <4 x float> %12, <4 x float>* %8, align 16
-  br label %for.body18
-
-for.body18:                                       ; preds = %for.body18, %middle.block
-  %indvars.iv = phi i64 [ 0, %middle.block ], [ %indvars.iv.next, %for.body18 ]
-  %arrayidx20 = getelementptr inbounds [16 x float]* %a, i64 0, i64 %indvars.iv
-  %13 = load float* %arrayidx20, align 4, !tbaa !3
-  %conv21 = fpext float %13 to double
-  %14 = trunc i64 %indvars.iv to i32
-  %call22 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @.str, i64 0, i64 0), i32 %14, double %conv21) #1
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 16
-  br i1 %exitcond, label %for.end25, label %for.body18
-
-for.end25:                                        ; preds = %for.body18
-  call void @llvm.lifetime.end(i64 64, i8* %1) #1
-  ret i32 0
-}
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-
-; Function Attrs: nounwind 
-declare i32 @printf(i8* nocapture, ...) #2
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
-
-; Function Attrs: nounwind 
-declare double @strtod(i8*, i8** nocapture) #2
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
-attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"float", metadata !1}
diff --git a/hpvm/test/testKernel/kernel-spir32.ll b/hpvm/test/testKernel/kernel-spir32.ll
deleted file mode 100644
index e5c403f87b49073edfe691515d47d42b7846933c..0000000000000000000000000000000000000000
--- a/hpvm/test/testKernel/kernel-spir32.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; ModuleID = '/tmp/qt_temp.w24812'
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
-target triple = "spir-unknown-unknown"
-
-define cc76 void @mysgemmNT(float addrspace(1)* nocapture %A, float addrspace(1)* nocapture %B, float addrspace(1)* nocapture %C) nounwind {
-  %1 = tail call cc75 i32 @_Z13get_global_idj(i32 0) nounwind readnone
-  %2 = getelementptr inbounds float addrspace(1)* %A, i32 %1
-  %3 = load float addrspace(1)* %2, align 4, !tbaa !9
-  %4 = getelementptr inbounds float addrspace(1)* %B, i32 %1
-  %5 = load float addrspace(1)* %4, align 4, !tbaa !9
-  %6 = fmul float %3, %5
-  %7 = getelementptr inbounds float addrspace(1)* %C, i32 %1
-  store float %6, float addrspace(1)* %7, align 4, !tbaa !9
-  ret void
-}
-
-declare cc75 i32 @_Z13get_global_idj(i32) nounwind readnone
-
-!opencl.kernels = !{!0}
-!opencl.enable.FP_CONTRACT = !{}
-!opencl.spir.version = !{!7}
-!opencl.ocl.version = !{!7}
-!opencl.used.extensions = !{!8}
-!opencl.used.optional.core.features = !{!8}
-!opencl.compiler.options = !{!8}
-
-!0 = metadata !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*)* @mysgemmNT, metadata !1, metadata !2, metadata !3, metadata !4, metadata !5, metadata !6}
-!1 = metadata !{metadata !"kernel_arg_addr_space", i32 1, i32 1, i32 1}
-!2 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none", metadata !"none"}
-!3 = metadata !{metadata !"kernel_arg_type", metadata !"float*", metadata !"float*", metadata !"float*"}
-!4 = metadata !{metadata !"kernel_arg_type_qual", metadata !"const", metadata !"const", metadata !""}
-!5 = metadata !{metadata !"kernel_arg_base_type", metadata !"float*", metadata !"float*", metadata !"float*"}
-!6 = metadata !{metadata !"kernel_arg_name", metadata !"A", metadata !"B", metadata !"C"}
-!7 = metadata !{i32 1, i32 2}
-!8 = metadata !{}
-!9 = metadata !{metadata !"float", metadata !10}
-!10 = metadata !{metadata !"omnipotent char", metadata !11}
-!11 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/hpvm/test/testKernel/kernel-spir64.ll b/hpvm/test/testKernel/kernel-spir64.ll
deleted file mode 100644
index 9d97d957fa316110c5461cbd329e731491dc5c89..0000000000000000000000000000000000000000
--- a/hpvm/test/testKernel/kernel-spir64.ll
+++ /dev/null
@@ -1,40 +0,0 @@
-; ModuleID = '/tmp/qt_temp.w24812'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
-target triple = "spir64-unknown-unknown"
-
-define cc76 void @mysgemmNT(float addrspace(1)* nocapture %A, float addrspace(1)* nocapture %B, float addrspace(1)* nocapture %C) nounwind {
-  %1 = tail call cc75 i64 @_Z13get_global_idj(i32 0) nounwind readnone
-  %sext = shl i64 %1, 32
-  %2 = ashr exact i64 %sext, 32
-  %3 = getelementptr inbounds float addrspace(1)* %A, i64 %2
-  %4 = load float addrspace(1)* %3, align 4, !tbaa !9
-  %5 = getelementptr inbounds float addrspace(1)* %B, i64 %2
-  %6 = load float addrspace(1)* %5, align 4, !tbaa !9
-  %7 = fmul float %4, %6
-  %8 = getelementptr inbounds float addrspace(1)* %C, i64 %2
-  store float %7, float addrspace(1)* %8, align 4, !tbaa !9
-  ret void
-}
-
-declare cc75 i64 @_Z13get_global_idj(i32) nounwind readnone
-
-!opencl.kernels = !{!0}
-!opencl.enable.FP_CONTRACT = !{}
-!opencl.spir.version = !{!7}
-!opencl.ocl.version = !{!7}
-!opencl.used.extensions = !{!8}
-!opencl.used.optional.core.features = !{!8}
-!opencl.compiler.options = !{!8}
-
-!0 = metadata !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*)* @mysgemmNT, metadata !1, metadata !2, metadata !3, metadata !4, metadata !5, metadata !6}
-!1 = metadata !{metadata !"kernel_arg_addr_space", i32 1, i32 1, i32 1}
-!2 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none", metadata !"none"}
-!3 = metadata !{metadata !"kernel_arg_type", metadata !"float*", metadata !"float*", metadata !"float*"}
-!4 = metadata !{metadata !"kernel_arg_type_qual", metadata !"const", metadata !"const", metadata !""}
-!5 = metadata !{metadata !"kernel_arg_base_type", metadata !"float*", metadata !"float*", metadata !"float*"}
-!6 = metadata !{metadata !"kernel_arg_name", metadata !"A", metadata !"B", metadata !"C"}
-!7 = metadata !{i32 1, i32 2}
-!8 = metadata !{}
-!9 = metadata !{metadata !"float", metadata !10}
-!10 = metadata !{metadata !"omnipotent char", metadata !11}
-!11 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/hpvm/test/testKernel/kernel.asm b/hpvm/test/testKernel/kernel.asm
deleted file mode 100644
index fe6cadaf41b3683ae89710711ea763f76096edf3..0000000000000000000000000000000000000000
--- a/hpvm/test/testKernel/kernel.asm
+++ /dev/null
@@ -1,83 +0,0 @@
-	.file	"main"
-	.text
-	.globl	mysgemmNT
-	.align	16, 0x90
-	.type	mysgemmNT,@function
-mysgemmNT:
-	.cfi_startproc
-	pushq	%rbp
-.Ltmp3:
-	.cfi_def_cfa_offset 16
-.Ltmp4:
-	.cfi_offset %rbp, -16
-	movq	%rsp, %rbp
-.Ltmp5:
-	.cfi_def_cfa_register %rbp
-	pushq	%r14
-	pushq	%rbx
-	andq	$-8, %rsp
-.Ltmp6:
-	.cfi_offset %rbx, -32
-.Ltmp7:
-	.cfi_offset %r14, -24
-	movq	(%rsi), %r9
-	movq	32(%rdi), %r8
-	movq	16(%rdi), %r14
-	movq	(%rdi), %rcx
-	movq	8(%rdi), %rdx
-	movq	80(%rdi), %r10
-	movq	%r10, %rsi
-	sarq	$2, %rsi
-	je	.LBB0_3
-	movl	%r10d, %eax
-	imull	%r9d, %eax
-	addl	%r8d, %eax
-	shlq	$32, %rax
-	movabsq	$17179869184, %r11
-	movq	%rsi, %rbx
-	.align	16, 0x90
-.LBB0_2:
-	movq	%rax, %rdi
-	sarq	$32, %rdi
-	vmovups	(%rdx,%rdi,4), %xmm0
-	vmulps	(%rcx,%rdi,4), %xmm0, %xmm0
-	vmovups	%xmm0, (%r14,%rdi,4)
-	addq	%r11, %rax
-	decq	%rbx
-	jne	.LBB0_2
-.LBB0_3:
-	movq	%r10, %rax
-	andq	$-4, %rax
-	cmpq	%rax, %r10
-	je	.LBB0_6
-	shlq	$2, %rsi
-	movq	%r10, %rdi
-	subq	%rsi, %rdi
-	negq	%rdi
-	imull	%r9d, %r10d
-	addl	%r10d, %r8d
-	addl	%r8d, %esi
-	shlq	$32, %rsi
-	movabsq	$4294967296, %r8
-	.align	16, 0x90
-.LBB0_5:
-	movq	%rsi, %rax
-	sarq	$32, %rax
-	vmovss	(%rcx,%rax,4), %xmm0
-	vmulss	(%rdx,%rax,4), %xmm0, %xmm0
-	vmovss	%xmm0, (%r14,%rax,4)
-	addq	%r8, %rsi
-	incq	%rdi
-	jne	.LBB0_5
-.LBB0_6:
-	leaq	-16(%rbp), %rsp
-	popq	%rbx
-	popq	%r14
-	popq	%rbp
-	ret
-.Ltmp8:
-	.size	mysgemmNT, .Ltmp8-mysgemmNT
-	.cfi_endproc
-
-
-	.section	".note.GNU-stack","",@progbits
diff --git a/hpvm/test/testKernel/kernel.cl b/hpvm/test/testKernel/kernel.cl
deleted file mode 100644
index 235a6498c4a01d43af7933f1bc2ef585a80fdedf..0000000000000000000000000000000000000000
--- a/hpvm/test/testKernel/kernel.cl
+++ /dev/null
@@ -1,18 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * Kernel of dense matrix-matrix multiplication kernel.
- */
-
-__kernel void mysgemmNT( __global const float *A, __global const float *B, __global float* C)
-{
-    int m = get_global_id(0);
- 
-    C[m] = A[m] * B[m];
-}
diff --git a/hpvm/test/testKernel/kernel.ll b/hpvm/test/testKernel/kernel.ll
deleted file mode 100644
index 03a29ff6979b9648bae22f3caf6722e102fd78dd..0000000000000000000000000000000000000000
--- a/hpvm/test/testKernel/kernel.ll
+++ /dev/null
@@ -1,154 +0,0 @@
-; ModuleID = 'main'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-pc-linux"
-
-; Function Attrs: nounwind
-declare void @__mysgemmNT_before.AddImplicitArgs(float addrspace(1)* nocapture, float addrspace(1)* nocapture, float addrspace(1)* nocapture) #0
-
-; Function Attrs: nounwind readnone
-declare i64 @_Z13get_global_idj(i32) #1
-
-declare [7 x i64] @__WG.boundaries.mysgemmNT_before.AddImplicitArgs(float addrspace(1)*, float addrspace(1)*, float addrspace(1)*)
-
-declare i64 @_Z14get_local_sizej(i32)
-
-declare i64 @get_base_global_id.(i32)
-
-declare i1 @__ocl_allOne(i1)
-
-declare i1 @__ocl_allZero(i1)
-
-; Function Attrs: alwaysinline nounwind
-declare void @__mysgemmNT_separated_args(float addrspace(1)* nocapture, float addrspace(1)* nocapture, float addrspace(1)* nocapture, i8 addrspace(3)* noalias, { i64, [3 x i64], [3 x i64], [2 x [3 x i64]], [3 x i64], {}*, {}* }* noalias, i64* noalias, [4 x i64], i8* noalias, {}* noalias) #2
-
-declare [7 x i64] @WG.boundaries.mysgemmNT(float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i8 addrspace(3)* noalias, { i64, [3 x i64], [3 x i64], [2 x [3 x i64]], [3 x i64], {}*, {}* }* noalias, i64* noalias, [4 x i64], i8* noalias, {}* noalias)
-
-define void @mysgemmNT(i8* noalias %pUniformArgs, i64* noalias %pWGId, {}* noalias %RuntimeHandle) {
-wrapper_entry:
-  %0 = bitcast i8* %pUniformArgs to float addrspace(1)**
-  %explicit_0 = load float addrspace(1)** %0, align 8
-  %1 = getelementptr i8* %pUniformArgs, i64 8
-  %2 = bitcast i8* %1 to float addrspace(1)**
-  %explicit_1 = load float addrspace(1)** %2, align 8
-  %3 = getelementptr i8* %pUniformArgs, i64 16
-  %4 = bitcast i8* %3 to float addrspace(1)**
-  %explicit_2 = load float addrspace(1)** %4, align 8
-  %5 = getelementptr i8* %pUniformArgs, i64 80
-  %6 = bitcast i8* %5 to i64*
-  %LocalSize_0 = load i64* %6, align 8
-  %7 = getelementptr i8* %pUniformArgs, i64 32
-  %8 = bitcast i8* %7 to i64*
-  %GlobalOffset_0 = load i64* %8, align 8
-  %GroupID_0 = load i64* %pWGId, align 8
-  %vector.size.i = ashr i64 %LocalSize_0, 2
-  %num.vector.wi.i = shl nsw i64 %vector.size.i, 2
-  %9 = icmp eq i64 %vector.size.i, 0
-  br i1 %9, label %scalarIf.i, label %dim_0_vector_pre_head.i.preheader
-
-dim_0_vector_pre_head.i.preheader:                ; preds = %wrapper_entry
-  %10 = mul i64 %LocalSize_0, %GroupID_0
-  %11 = add i64 %GlobalOffset_0, %10
-  %12 = mul i64 %11, 4294967296
-  br label %dim_0_vector_pre_head.i
-
-dim_0_vector_pre_head.i:                          ; preds = %dim_0_vector_pre_head.i.preheader, %dim_0_vector_pre_head.i
-  %lsr.iv5 = phi i64 [ %12, %dim_0_vector_pre_head.i.preheader ], [ %lsr.iv.next6, %dim_0_vector_pre_head.i ]
-  %lsr.iv3 = phi i64 [ %vector.size.i, %dim_0_vector_pre_head.i.preheader ], [ %lsr.iv.next4, %dim_0_vector_pre_head.i ]
-  %extractvector_func.i = ashr exact i64 %lsr.iv5, 32
-  %13 = getelementptr inbounds float addrspace(1)* %explicit_0, i64 %extractvector_func.i
-  %ptrTypeCastvector_func.i = bitcast float addrspace(1)* %13 to <4 x float> addrspace(1)*
-  %14 = load <4 x float> addrspace(1)* %ptrTypeCastvector_func.i, align 1
-  %15 = getelementptr inbounds float addrspace(1)* %explicit_1, i64 %extractvector_func.i
-  %ptrTypeCast5vector_func.i = bitcast float addrspace(1)* %15 to <4 x float> addrspace(1)*
-  %16 = load <4 x float> addrspace(1)* %ptrTypeCast5vector_func.i, align 1
-  %17 = fmul <4 x float> %14, %16
-  %18 = getelementptr inbounds float addrspace(1)* %explicit_2, i64 %extractvector_func.i
-  %ptrTypeCast6vector_func.i = bitcast float addrspace(1)* %18 to <4 x float> addrspace(1)*
-  store <4 x float> %17, <4 x float> addrspace(1)* %ptrTypeCast6vector_func.i, align 1
-  %lsr.iv.next4 = add i64 %lsr.iv3, -1
-  %lsr.iv.next6 = add i64 %lsr.iv5, 17179869184
-  %dim_0_vector_cmp.to.max.i = icmp eq i64 %lsr.iv.next4, 0
-  br i1 %dim_0_vector_cmp.to.max.i, label %scalarIf.i, label %dim_0_vector_pre_head.i
-
-scalarIf.i:                                       ; preds = %dim_0_vector_pre_head.i, %wrapper_entry
-  %19 = icmp eq i64 %LocalSize_0, %num.vector.wi.i
-  br i1 %19, label %__mysgemmNT_separated_args.exit, label %scalar_kernel_entry.i.preheader
-
-scalar_kernel_entry.i.preheader:                  ; preds = %scalarIf.i
-  %20 = mul i64 %vector.size.i, 4
-  %21 = sub i64 %LocalSize_0, %20
-  %22 = mul i64 %LocalSize_0, %GroupID_0
-  %23 = add i64 %GlobalOffset_0, %22
-  %24 = add i64 %23, %20
-  %25 = mul i64 %24, 4294967296
-  %26 = sub i64 0, %21
-  br label %scalar_kernel_entry.i
-
-scalar_kernel_entry.i:                            ; preds = %scalar_kernel_entry.i.preheader, %scalar_kernel_entry.i
-  %lsr.iv7 = phi i64 [ %26, %scalar_kernel_entry.i.preheader ], [ %lsr.iv.next8, %scalar_kernel_entry.i ]
-  %lsr.iv1 = phi i64 [ %25, %scalar_kernel_entry.i.preheader ], [ %lsr.iv.next2, %scalar_kernel_entry.i ]
-  %27 = ashr exact i64 %lsr.iv1, 32
-  %28 = getelementptr inbounds float addrspace(1)* %explicit_0, i64 %27
-  %29 = load float addrspace(1)* %28, align 1
-  %30 = getelementptr inbounds float addrspace(1)* %explicit_1, i64 %27
-  %31 = load float addrspace(1)* %30, align 1
-  %32 = fmul float %29, %31
-  %33 = getelementptr inbounds float addrspace(1)* %explicit_2, i64 %27
-  store float %32, float addrspace(1)* %33, align 1
-  %lsr.iv.next2 = add i64 %lsr.iv1, 4294967296
-  %lsr.iv.next8 = add i64 %lsr.iv7, 1
-  %dim_0_cmp.to.max.i = icmp eq i64 %lsr.iv.next8, 0
-  br i1 %dim_0_cmp.to.max.i, label %__mysgemmNT_separated_args.exit, label %scalar_kernel_entry.i
-
-__mysgemmNT_separated_args.exit:                  ; preds = %scalar_kernel_entry.i, %scalarIf.i
-  ret void
-}
-
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { alwaysinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!opencl.kernels = !{!0}
-!opencl.enable.FP_CONTRACT = !{}
-!opencl.spir.version = !{!7}
-!opencl.ocl.version = !{!7}
-!opencl.used.extensions = !{!8}
-!opencl.used.optional.core.features = !{!8}
-!opencl.compiler.options = !{!8}
-!opencl.kernel_info = !{!9}
-!opencl.module_info_list = !{!26}
-!llvm.functions_info = !{}
-!opencl.functions_stats = !{}
-!opencl.stat_descriptions = !{}
-!opencl.module_stat_info = !{}
-
-!0 = metadata !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i8 addrspace(3)*, { i64, [3 x i64], [3 x i64], [2 x [3 x i64]], [3 x i64], {}*, {}* }*, i64*, [4 x i64], i8*, {}*)* @__mysgemmNT_separated_args, metadata !1, metadata !2, metadata !3, metadata !4, metadata !5, metadata !6}
-!1 = metadata !{metadata !"kernel_arg_addr_space", i32 1, i32 1, i32 1}
-!2 = metadata !{metadata !"kernel_arg_access_qual", metadata !"none", metadata !"none", metadata !"none"}
-!3 = metadata !{metadata !"kernel_arg_type", metadata !"float*", metadata !"float*", metadata !"float*"}
-!4 = metadata !{metadata !"kernel_arg_type_qual", metadata !"const", metadata !"const", metadata !""}
-!5 = metadata !{metadata !"kernel_arg_base_type", metadata !"float*", metadata !"float*", metadata !"float*"}
-!6 = metadata !{metadata !"kernel_arg_name", metadata !"A", metadata !"B", metadata !"C"}
-!7 = metadata !{i32 1, i32 2}
-!8 = metadata !{}
-!9 = metadata !{void (float addrspace(1)*, float addrspace(1)*, float addrspace(1)*, i8 addrspace(3)*, { i64, [3 x i64], [3 x i64], [2 x [3 x i64]], [3 x i64], {}*, {}* }*, i64*, [4 x i64], i8*, {}*)* @__mysgemmNT_separated_args, metadata !10}
-!10 = metadata !{metadata !11, metadata !12, metadata !13, metadata !14, metadata !15, metadata !16, metadata !17, metadata !18, metadata !19, metadata !20, metadata !21, metadata !22, metadata !23, metadata !24, metadata !25}
-!11 = metadata !{metadata !"local_buffer_size", i32 0}
-!12 = metadata !{metadata !"barrier_buffer_size", i32 0}
-!13 = metadata !{metadata !"kernel_execution_length", i32 11}
-!14 = metadata !{metadata !"max_wg_dimensions", i32 1}
-!15 = metadata !{metadata !"kernel_has_barrier", i1 false}
-!16 = metadata !{metadata !"kernel_has_global_sync", i1 false}
-!17 = metadata !{metadata !"no_barrier_path", i1 true}
-!18 = metadata !{metadata !"vectorized_kernel", null}
-!19 = metadata !{metadata !"vectorized_width", i32 4}
-!20 = metadata !{metadata !"kernel_wrapper", void (i8*, i64*, {}*)* @mysgemmNT}
-!21 = metadata !{metadata !"scalarized_kernel", null}
-!22 = metadata !{metadata !"block_literal_size", null}
-!23 = metadata !{metadata !"private_memory_size", i32 0}
-!24 = metadata !{metadata !"vectorization_dimension", i32 0}
-!25 = metadata !{metadata !"can_unite_workgroups", i1 true}
-!26 = metadata !{metadata !27, metadata !28, metadata !29}
-!27 = metadata !{metadata !"global_variable_total_size", i64 0}
-!28 = metadata !{metadata !"gen_addr_space_pointer_counter", null}
-!29 = metadata !{metadata !"gen_addr_space_pointer_warnings"}
diff --git a/hpvm/test/vectorAdd/Makefile b/hpvm/test/vectorAdd/Makefile
deleted file mode 100644
index ecc4e035e87f67283563676aef7f516284055ef6..0000000000000000000000000000000000000000
--- a/hpvm/test/vectorAdd/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-PASSES := 
-SOURCEFILES := vecadd.ll vecadd.opt.ll
-
-.PHONY: clean
-
-all: $(SOURCEFILES)
-
-vecadd.opt.ll: vecadd.opt.bc
-	llvm-dis $< > $@
-
-vecadd.opt.bc: vecadd.bc
-	opt $(PASSES) $< > $@
-
-vecadd.bc: vecadd.ll
-	llvm-as $< > $@
-
-vecadd.ll: vecadd.c
-	clang -O3 -emit-llvm -S $< -o $@
-
-clean :
-	rm -f $(SOURCEFILES) *.bc
diff --git a/hpvm/test/vectorAdd/old_visc_vecadd.ll b/hpvm/test/vectorAdd/old_visc_vecadd.ll
deleted file mode 100644
index c88f20378b3708e0b13e7625c92ee8aa22b1997f..0000000000000000000000000000000000000000
--- a/hpvm/test/vectorAdd/old_visc_vecadd.ll
+++ /dev/null
@@ -1,250 +0,0 @@
-; ModuleID = 'vecadd.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%allocateResult_ty = struct { i32* }
-define void @allocateResult(i32 %size, %allocateResult_ty* %out) {
-  %sext = shl i64 %size, 32
-  %0 = ashr exact i64 %sext, 30
-  %1 = tail call noalias i8* @malloc(i64 %0) #3
-  %2 = bitcast i8* %1 to i32*
-
-  %ptr = getelementptr inbounds %allocateResult_ty* *%out, i64 0, i64 0
-  store i32* %2, %ptr
-  ret void
-}
-
-%vecadd_ty = struct { i32* }
-define void @vecadd(i32* %adata, i32* %bdata, i32* %cdata, %vecadd_ty* %out) {
-entry:
-  %node = call i8* @llvm.visc.getNode()
-  %idx = call i32 @llvm.VISC.getNodeInstanceID(%node)
-
-  %idxprom = sext i32 %idx to i64
-  %arrayidxa = getelementptr inbounds i32* %adata, i64 %idxprom
-  %a = load i32* %arrayidxa
-  %arrayidxb = getelementptr inbounds i32* %bdata, i64 %idxprom
-  %b = load i32* %arrayidxb
-
-  %add = add nsw i32 %a, %b
-  %arrayidxc = getelementptr inbounds i32* %cdata, i64 %idxprom
-  store i32 %add, i32* %arrayidxc, align 4
-
-  %ptr = getelementptr inbounds %vecadd_ty* *%out, i64 0, i64 0
-  store i32* %cdata, %ptr
-  ret void
-}
-
-%wrapperKernelFunction_ty = struct { i32* }
-define void @wrapperKernelFunction(i32 %n, i32* nocapture %A, i32* nocapture %B, %wrapperKernelFunction_ty* %out) {
-
-  %node = call i8* @llvm.visc.getNode()
-
-  %allocate_result_node = call i8* @llvm.visc.createNode(@allocateResult)
-  %vecadd_node = call i8* @llvm.visc.createNode1D(@vecadd, %n)
-
-  %edge0 = call i8* @llvm.visc.createEdge(%node, %allocate_result_node, @fmap4, @argmap4)
-  %edge1 = call i8* @llvm.visc.createEdge(%node, %vecadd_node, @fmap5, @argmap5)
-  %edge2 = call i8* @llvm.visc.createEdge(%allocate_result_node, %vecadd_node, @fmap6, @argmap6)
-  %edge3 = call i8* @llvm.visc.createEdge(%vecadd_node, %node, @fmap7, @argmap7)
-
-}
-
-; This function does not return a result. Is type of %out void* ?
-define i32 @cleanupFunction(i8* %A, i8* %B, i8* %C, void* %out) {
-  tail call void @free(i8* %A) #3
-  tail call void @free(i8* %B) #3
-  tail call void @free(i8* %C) #3
-  ret i32 0
-}
-
-%initializeFunction_ty = struct { i32, i32*, i32* }
-define i32 @initializeFunction(i8** %arg, %initializeFunction_ty* %out) {
-  %1 = getelementptr inbounds i8** %arg, i64 1
-  %2 = load i8** %1, align 8, !tbaa !0
-  %3 = tail call i64 @strtol(i8* nocapture %2, i8** null, i32 10) #3
-  %4 = trunc i64 %3 to i32
-  %sext = shl i64 %3, 32
-  %5 = ashr exact i64 %sext, 30
-  %6 = tail call noalias i8* @malloc(i64 %5) #3
-  %7 = bitcast i8* %6 to i32*
-  %8 = tail call noalias i8* @malloc(i64 %5) #3
-  %9 = bitcast i8* %8 to i32*
-
-  %12 = icmp sgt i32 %4, 0
-  br i1 %12, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %13 = tail call i32 @rand() #3
-  %14 = srem i32 %13, 1000
-  %15 = getelementptr inbounds i32* %7, i64 %indvars.iv
-  store i32 %14, i32* %15, align 4, !tbaa !3
-  %16 = tail call i32 @rand() #3
-  %17 = srem i32 %16, 1000
-  %18 = getelementptr inbounds i32* %9, i64 %indvars.iv
-  store i32 %17, i32* %18, align 4, !tbaa !3
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %4
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  %ptrN = getelementptr inbounds %initializeFunction_ty* *%out, i64 0, i64 0
-  store i32* %4, %ptrN
-  %ptrA = getelementptr inbounds %initializeFunction_ty* *%out, i64 0, i64 1
-  store i32* %7, %ptrA
-  %ptrB = getelementptr inbounds %initializeFunction_ty* *%out, i64 0, i64 2
-  store i32* %9, %ptrB
-  ret i32 0
-} ;
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
-
-  %node = call i8* @llvm.visc.getNode()
-
-  %init_node = call i8* @llvm.visc.createNode(@initializeFunction)
-  %kernel_node = call i8* @llvm.visc.createNode(@wrapperKernelFunction)
-  %cleanup_node = call i8* @llvm.visc.createNode(@cleanupFunction)
-
-  %edge0 = call i8* @llvm.visc.createEdge(%node, %init_node, @fmap0, @argmap0)
-  %edge1 = call i8* @llvm.visc.createEdge(%init_node, %kernel_node, @fmap1, @argmap1)
-  %edge2 = call i8* @llvm.visc.createEdge(%init_node, %cleanup_node, @fmap2, @argmap2)
-  %edge3 = call i8* @llvm.visc.createEdge(%kernel_node, %cleanup_node, @fmap3, @argmap3)
-
-  ret i32 0
-}
-
-
-; FUNCTIONS FOR EDGES
-
-; There is only one instance of both nodes, so this function is not needed
-define i1 @fmap0(i32 %i, i32 %j) {
-  ret true
-}
-
-define i1 @argmap0(i32 %i, i32 %j) {
-  %0 = icmp eq i32 %i, 1
-  %1 = icmp eq i32 %j, 0
-  %res = and i1 %0, %1
-  ret %res
-}
-
-; There is only one instance of both nodes, so this function is not needed
-define i1 @fmap1(i32 %i, i32 %j) {
-  ret true
-}
-
-define i1 @argmap1(i32 %i, i32 %j) {
-  %res = icmp eq i32 %i, %j
-  ret %res
-}
-
-; There is only one instance of both nodes, so this function is not needed
-define i1 @fmap2(i32 %i, i32 %j) {
-  ret true
-}
-
-define i1 @argmap2(i32 %i, i32 %j) {
-  %0 = icmp eq i32 %i, 1
-  %1 = icmp eq i32 %i, 2
-  %out_arg = and i1 %0, %1
-
-  %in_arg = add i32 %j, 1
-  %eq = icmp eq i32 %i, %j
-
-  %res = and i1 %out_arg, %eq
-  ret %res
-}
-
-; There is only one instance of both nodes, so this function is not needed
-define i1 @fmap3(i32 %i, i32 %j) {
-  ret true
-}
-
-define i1 @argmap3(i32 %i, i32 %j) {
-  %0 = icmp eq i32 %i, 0
-  %1 = icmp eq i32 %j, 2
-  %res = and i1 %0, %1
-  ret %res
-}
-
-; There is only one instance of both nodes, so this function is not needed
-define i1 @fmap4(i32 %i, i32 %j) {
-  ret true
-}
-
-define i1 @argmap4(i32 %i, i32 %j) {
-  %res = icmp eq i32 %i, %0
-  ret %res
-}
-
-; There is an edge from the unique instance of the source node to all the
-; instances of the destination node
-define i1 @fmap5(i32 %i, i32 %j) {
-  ret true
-}
-
-define i1 @argmap5(i32 %i, i32 %j) {
-  %0 = icmp eq i32 %i, 1
-  %1 = icmp eq i32 %i, 2
-  %out_arg = and i1 %0, %1
-
-  %in_arg = add i32 %j, 1
-  %eq = icmp eq i32 %i, %out_arg
-
-  %res = and i1 out_arg, %eq
-  ret %res
-}
-
-; There is an edge from the unique instance of the source node to all the
-; instances of the destination node
-define i1 @fmap6(i32 %i, i32 %j) {
-  ret true
-}
-
-define i1 @argmap6(i32 %i, i32 %j) {
-  %0 = icmp eq i32 %i, 0
-  %1 = icmp eq i32 %j, 2
-  %res = and i1 %0, %1
-  ret %res
-}
-
-; There are edges from all the instances of the source node to the unique
-; instance of the destination node
-define i1 @fmap7(i32 %i, i32 %j) {
-  ret true
-}
-
-define i1 @argmap7(i32 %i, i32 %j) {
-  ret true
-}
-
-declare i8* @llvm.visc.getNode() nounwind readnone
-
-declare i32 @llvm.VISC.getNodeInstanceID(i8*) nounwind readnone
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #1
-
-; Function Attrs: nounwind
-declare i32 @rand() #1
-
-declare void @vectorAdd(i32, i32*, i32*, i32*) #2
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #1
-
-; Function Attrs: nounwind
-declare i64 @strtol(i8*, i8** nocapture, i32) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind }
-
-!0 = metadata !{metadata !"any pointer", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{metadata !"int", metadata !1}
diff --git a/hpvm/test/vectorAdd/vecadd.c b/hpvm/test/vectorAdd/vecadd.c
deleted file mode 100644
index ad214a394ab676101d0c2384686f05dccca8a2b1..0000000000000000000000000000000000000000
--- a/hpvm/test/vectorAdd/vecadd.c
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Main entry of vector addition kernel
- */
-
-#include <stdio.h>
-#include <stdlib.h>
-
-extern void vectorAdd(int n, int *A, int *B, int *C) ;
-/*
-{
-
-  for (int i = 0; i < n; i++)
-    C[i] = A[i] + B[i];
-
-  return;
-}
-*/
-
-int main (int argc, char *argv[]) {
-  int n = atoi(argv[1]);
-
-  int *A, *B, *C;
-
-  /*******************************************************************
-   *                         Allocate memory                         *
-   *******************************************************************/
-initialize the input/output vectors
-  A = (int *) malloc(n*sizeof(int));
-  B = (int *) malloc(n*sizeof(int));
-  C = (int *) malloc(n*sizeof(int));
-
-  /*******************************************************************
-   *                      Initialize memory                          *
-   *******************************************************************/
-  for (int i = 0; i < n; i++) {
-    A[i] = rand() % 1000;
-    B[i] = rand() % 1000;
-  }
-
-  /*******************************************************************
-   *                         Kernel Call                             *
-   *******************************************************************/
-  vectorAdd(n, A, B, C);
-
-  /*******************************************************************
-   *                    Free allocated memory                        *
-   *******************************************************************/
-
-  free(A); free(B); free(C);
-
-  return 0;
-}
-
diff --git a/hpvm/test/vectorAdd/vecadd.ll b/hpvm/test/vectorAdd/vecadd.ll
deleted file mode 100644
index 73524b6e5763f928bd273d64046a1b1c94a131a1..0000000000000000000000000000000000000000
--- a/hpvm/test/vectorAdd/vecadd.ll
+++ /dev/null
@@ -1,212 +0,0 @@
-; ModuleID = 'vecadd.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: nounwind uwtable
-define void @vectorAdd(i32 %n, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C) #0 {
-  %1 = icmp sgt i32 %n, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0
-  %cnt.cast = zext i32 %n to i64
-  %n.vec = and i64 %cnt.cast, 4294967288
-  %cmp.zero = icmp eq i64 %n.vec, 0
-  %2 = add i32 %n, -1
-  %3 = zext i32 %2 to i64
-  %scevgep = getelementptr i32* %C, i64 %3
-  br i1 %cmp.zero, label %middle.block, label %vector.memcheck
-
-vector.memcheck:                                  ; preds = %.lr.ph
-  %scevgep8 = getelementptr i32* %B, i64 %3
-  %scevgep5 = getelementptr i32* %A, i64 %3
-  %bound111 = icmp uge i32* %scevgep, %B
-  %bound010 = icmp uge i32* %scevgep8, %C
-  %bound1 = icmp uge i32* %scevgep, %A
-  %bound0 = icmp uge i32* %scevgep5, %C
-  %found.conflict12 = and i1 %bound010, %bound111
-  %found.conflict = and i1 %bound0, %bound1
-  %conflict.rdx = or i1 %found.conflict, %found.conflict12
-  br i1 %conflict.rdx, label %middle.block, label %vector.body
-
-vector.body:                                      ; preds = %vector.memcheck, %vector.body
-  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.memcheck ]
-  %4 = getelementptr inbounds i32* %A, i64 %index
-  %5 = bitcast i32* %4 to <4 x i32>*
-  %wide.load = load <4 x i32>* %5, align 4
-  %.sum21 = or i64 %index, 4
-  %6 = getelementptr i32* %A, i64 %.sum21
-  %7 = bitcast i32* %6 to <4 x i32>*
-  %wide.load14 = load <4 x i32>* %7, align 4
-  %8 = getelementptr inbounds i32* %B, i64 %index
-  %9 = bitcast i32* %8 to <4 x i32>*
-  %wide.load15 = load <4 x i32>* %9, align 4
-  %.sum22 = or i64 %index, 4
-  %10 = getelementptr i32* %B, i64 %.sum22
-  %11 = bitcast i32* %10 to <4 x i32>*
-  %wide.load16 = load <4 x i32>* %11, align 4
-  %12 = add nsw <4 x i32> %wide.load15, %wide.load
-  %13 = add nsw <4 x i32> %wide.load16, %wide.load14
-  %14 = getelementptr inbounds i32* %C, i64 %index
-  %15 = bitcast i32* %14 to <4 x i32>*
-  store <4 x i32> %12, <4 x i32>* %15, align 4
-  %.sum23 = or i64 %index, 4
-  %16 = getelementptr i32* %C, i64 %.sum23
-  %17 = bitcast i32* %16 to <4 x i32>*
-  store <4 x i32> %13, <4 x i32>* %17, align 4
-  %index.next = add i64 %index, 8
-  %18 = icmp eq i64 %index.next, %n.vec
-  br i1 %18, label %middle.block, label %vector.body
-
-middle.block:                                     ; preds = %vector.body, %vector.memcheck, %.lr.ph
-  %resume.val = phi i64 [ 0, %.lr.ph ], [ 0, %vector.memcheck ], [ %n.vec, %vector.body ]
-  %cmp.n = icmp eq i64 %cnt.cast, %resume.val
-  br i1 %cmp.n, label %._crit_edge, label %scalar.ph
-
-scalar.ph:                                        ; preds = %middle.block, %scalar.ph
-  %indvars.iv = phi i64 [ %indvars.iv.next, %scalar.ph ], [ %resume.val, %middle.block ]
-  %19 = getelementptr inbounds i32* %A, i64 %indvars.iv
-  %20 = load i32* %19, align 4, !tbaa !0
-  %21 = getelementptr inbounds i32* %B, i64 %indvars.iv
-  %22 = load i32* %21, align 4, !tbaa !0
-  %23 = add nsw i32 %22, %20
-  %24 = getelementptr inbounds i32* %C, i64 %indvars.iv
-  store i32 %23, i32* %24, align 4, !tbaa !0
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %._crit_edge, label %scalar.ph, !llvm.vectorizer.already_vectorized !3
-
-._crit_edge:                                      ; preds = %middle.block, %scalar.ph, %0
-  ret void
-}
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
-  %1 = getelementptr inbounds i8** %argv, i64 1
-  %2 = load i8** %1, align 8, !tbaa !4
-  %3 = tail call i64 @strtol(i8* nocapture %2, i8** null, i32 10) #2
-  %4 = trunc i64 %3 to i32
-  %sext = shl i64 %3, 32
-  %5 = ashr exact i64 %sext, 30
-  %6 = tail call noalias i8* @malloc(i64 %5) #2
-  %7 = bitcast i8* %6 to i32*
-  %8 = tail call noalias i8* @malloc(i64 %5) #2
-  %9 = bitcast i8* %8 to i32*
-  %10 = tail call noalias i8* @malloc(i64 %5) #2
-  %11 = bitcast i8* %10 to i32*
-  %12 = icmp sgt i32 %4, 0
-  br i1 %12, label %.lr.ph, label %vectorAdd.exit
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %13 = tail call i32 @rand() #2
-  %14 = srem i32 %13, 1000
-  %15 = getelementptr inbounds i32* %7, i64 %indvars.iv
-  store i32 %14, i32* %15, align 4, !tbaa !0
-  %16 = tail call i32 @rand() #2
-  %17 = srem i32 %16, 1000
-  %18 = getelementptr inbounds i32* %9, i64 %indvars.iv
-  store i32 %17, i32* %18, align 4, !tbaa !0
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %4
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph
-  br i1 %12, label %.lr.ph.i, label %vectorAdd.exit
-
-.lr.ph.i:                                         ; preds = %._crit_edge
-  %cnt.cast.i = and i64 %3, 4294967295
-  %n.vec.i = and i64 %3, 4294967288
-  %cmp.zero.i = icmp eq i64 %n.vec.i, 0
-  %19 = add i64 %3, 4294967295
-  %20 = and i64 %19, 4294967295
-  %scevgep.i = getelementptr i32* %11, i64 %20
-  br i1 %cmp.zero.i, label %middle.block.i, label %vector.memcheck.i
-
-vector.memcheck.i:                                ; preds = %.lr.ph.i
-  %scevgep8.i = getelementptr i32* %9, i64 %20
-  %scevgep5.i = getelementptr i32* %7, i64 %20
-  %bound111.i = icmp uge i32* %scevgep.i, %9
-  %bound010.i = icmp uge i32* %scevgep8.i, %11
-  %bound1.i = icmp uge i32* %scevgep.i, %7
-  %bound0.i = icmp uge i32* %scevgep5.i, %11
-  %found.conflict12.i = and i1 %bound010.i, %bound111.i
-  %found.conflict.i = and i1 %bound0.i, %bound1.i
-  %conflict.rdx.i = or i1 %found.conflict.i, %found.conflict12.i
-  br i1 %conflict.rdx.i, label %middle.block.i, label %vector.body.i
-
-vector.body.i:                                    ; preds = %vector.memcheck.i, %vector.body.i
-  %index.i = phi i64 [ %index.next.i, %vector.body.i ], [ 0, %vector.memcheck.i ]
-  %21 = getelementptr inbounds i32* %7, i64 %index.i
-  %22 = bitcast i32* %21 to <4 x i32>*
-  %wide.load.i = load <4 x i32>* %22, align 4
-  %.sum21.i = or i64 %index.i, 4
-  %23 = getelementptr i32* %7, i64 %.sum21.i
-  %24 = bitcast i32* %23 to <4 x i32>*
-  %wide.load14.i = load <4 x i32>* %24, align 4
-  %25 = getelementptr inbounds i32* %9, i64 %index.i
-  %26 = bitcast i32* %25 to <4 x i32>*
-  %wide.load15.i = load <4 x i32>* %26, align 4
-  %27 = getelementptr i32* %9, i64 %.sum21.i
-  %28 = bitcast i32* %27 to <4 x i32>*
-  %wide.load16.i = load <4 x i32>* %28, align 4
-  %29 = add nsw <4 x i32> %wide.load15.i, %wide.load.i
-  %30 = add nsw <4 x i32> %wide.load16.i, %wide.load14.i
-  %31 = getelementptr inbounds i32* %11, i64 %index.i
-  %32 = bitcast i32* %31 to <4 x i32>*
-  store <4 x i32> %29, <4 x i32>* %32, align 4
-  %33 = getelementptr i32* %11, i64 %.sum21.i
-  %34 = bitcast i32* %33 to <4 x i32>*
-  store <4 x i32> %30, <4 x i32>* %34, align 4
-  %index.next.i = add i64 %index.i, 8
-  %35 = icmp eq i64 %index.next.i, %n.vec.i
-  br i1 %35, label %middle.block.i, label %vector.body.i
-
-middle.block.i:                                   ; preds = %vector.body.i, %vector.memcheck.i, %.lr.ph.i
-  %resume.val.i = phi i64 [ 0, %.lr.ph.i ], [ 0, %vector.memcheck.i ], [ %n.vec.i, %vector.body.i ]
-  %cmp.n.i = icmp eq i64 %cnt.cast.i, %resume.val.i
-  br i1 %cmp.n.i, label %vectorAdd.exit, label %scalar.ph.i
-
-scalar.ph.i:                                      ; preds = %middle.block.i, %scalar.ph.i
-  %indvars.iv.i = phi i64 [ %indvars.iv.next.i, %scalar.ph.i ], [ %resume.val.i, %middle.block.i ]
-  %36 = getelementptr inbounds i32* %7, i64 %indvars.iv.i
-  %37 = load i32* %36, align 4, !tbaa !0
-  %38 = getelementptr inbounds i32* %9, i64 %indvars.iv.i
-  %39 = load i32* %38, align 4, !tbaa !0
-  %40 = add nsw i32 %39, %37
-  %41 = getelementptr inbounds i32* %11, i64 %indvars.iv.i
-  store i32 %40, i32* %41, align 4, !tbaa !0
-  %indvars.iv.next.i = add i64 %indvars.iv.i, 1
-  %lftr.wideiv.i = trunc i64 %indvars.iv.next.i to i32
-  %exitcond.i = icmp eq i32 %lftr.wideiv.i, %4
-  br i1 %exitcond.i, label %vectorAdd.exit, label %scalar.ph.i, !llvm.vectorizer.already_vectorized !3
-
-vectorAdd.exit:                                   ; preds = %0, %scalar.ph.i, %._crit_edge, %middle.block.i
-  tail call void @free(i8* %6) #2
-  tail call void @free(i8* %8) #2
-  tail call void @free(i8* %10) #2
-  ret i32 0
-}
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #1
-
-; Function Attrs: nounwind
-declare i32 @rand() #1
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #1
-
-; Function Attrs: nounwind
-declare i64 @strtol(i8*, i8** nocapture, i32) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{}
-!4 = metadata !{metadata !"any pointer", metadata !1}
diff --git a/hpvm/test/vectorAdd/visc_vecadd.ll b/hpvm/test/vectorAdd/visc_vecadd.ll
deleted file mode 100644
index 55a2c030b706d7ba8277bce9434acfdd8622dfa7..0000000000000000000000000000000000000000
--- a/hpvm/test/vectorAdd/visc_vecadd.ll
+++ /dev/null
@@ -1,129 +0,0 @@
-; ModuleID = 'vecadd.c'
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-; Function Attrs: nounwind uwtable
-%vecadd_ty = struct { i32* }
-//TODO
-define void @vecadd(i32* %adata, i32* %bdata, i32* %cdata, %vecadd_ty* %out) {
-entry:
-  %node = call i8* @llvm.visc.getNode()
-  %idx = call i32 @llvm.VISC.getNodeInstanceID(%node)
-
-  %idxprom = sext i32 %idx to i64
-  %arrayidxa = getelementptr inbounds i32* %adata, i64 %idxprom
-  %a = load i32* %arrayidxa
-  %arrayidxb = getelementptr inbounds i32* %bdata, i64 %idxprom
-  %b = load i32* %arrayidxb
-
-  %add = add nsw i32 %a, %b
-  %arrayidxc = getelementptr inbounds i32* %cdata, i64 %idxprom
-  store i32 %add, i32* %arrayidxc, align 4
-
-  %ptr = getelementptr inbounds %vecadd_ty* %out, i64 0, i64 0
-  store i32* %cdata, %ptr
-  ret void
-}
-
-define void @vecaddWrapper(i32 %n, i32* %adata, i32* %bdata, i32* %cdata) {
-
-  %wrapper_node = call i8* @llvm.visc.getNode()
-  %kernel_node = call i8* @llvm.visc.createNode1D(@vecadd,%n)
-
-  %edge0 = call i8* @llvm.visc.createEdge(%wrapper_node, %kernel_node, @edgemap0, @argmap0)
-  %edge1 = call i8* @llvm.visc.createEdge(%kernel_node, %wrapper_node, @edgemap1, @argmap1)
-
-}
-
-; Function Attrs: nounwind uwtable
-define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
-; Read input size
-  %1 = getelementptr inbounds i8** %argv, i64 1
-  %2 = load i8** %1, align 8, !tbaa !4
-  %3 = tail call i64 @strtol(i8* nocapture %2, i8** null, i32 10) #2
-  %4 = trunc i64 %3 to i32
-
-; Allocate memory
-  %sext = shl i64 %3, 32
-  %5 = ashr exact i64 %sext, 30
-  %6 = tail call noalias i8* @malloc(i64 %5) #2
-  %7 = bitcast i8* %6 to i32*
-  %8 = tail call noalias i8* @malloc(i64 %5) #2
-  %9 = bitcast i8* %8 to i32*
-  %10 = tail call noalias i8* @malloc(i64 %5) #2
-  %11 = bitcast i8* %10 to i32*
-  %12 = icmp sgt i32 %4, 0
-  br i1 %12, label %.lr.ph, label %vectorAdd.exit
-
-; Initialize memory
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %13 = tail call i32 @rand() #2
-  %14 = srem i32 %13, 1000
-  %15 = getelementptr inbounds i32* %7, i64 %indvars.iv
-  store i32 %14, i32* %15, align 4, !tbaa !0
-  %16 = tail call i32 @rand() #2
-  %17 = srem i32 %16, 1000
-  %18 = getelementptr inbounds i32* %9, i64 %indvars.iv
-  store i32 %17, i32* %18, align 4, !tbaa !0
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %4
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph
-  br i1 %12, label %.lr.ph.i, label %vectorAdd.exit
-
-; Entrance point to the dataflow graph
-.lr.ph.i:                                         ; preds = %._crit_edge
-  %DFfuture = call i8* @llvm.visc.launch(@vecaddWrapper,%4,%7,%9,%11)
-; Get the result from the DF future.
-  %19 = bitcast i8* %DFfuture to %vecadd_ty*
-  %20 = getelementptr %vecadd_ty* %19, i32 0, i32 0
-  %21 = load i32** %20
-
-; Free allocated memory
-vectorAdd.exit:                                   ; preds = %0, %scalar.ph.i,
-                                                  ;         %._crit_edge,
-                                                  ;         %middle.block.i
-  tail call void @free(i8* %6) #2
-  tail call void @free(i8* %8) #2
-  tail call void @free(i8* %10) #2
-  ret i32 0
-}
-
-declare i8* @llvm.visc.launch(i8*, ...)
-
-declare i8* @llvm.visc.getNode() nounwind readnone
-declare i32 @llvm.VISC.getNodeInstanceID(i8*) nounwind readnone
-
-declare i8* @llvm.visc.createNode(i8*) nounwind readnone
-declare i8* @llvm.visc.createNode1D(i8*,i32) nounwind readnone
-
-declare i8* @llvm.visc.createEdge(i8*,i8*,i8*,i8*) nounwind readnone
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #1
-
-; Function Attrs: nounwind
-declare i32 @rand() #1
-
-; Function Attrs: nounwind
-declare void @free(i8* nocapture) #1
-
-; Function Attrs: nounwind
-declare i64 @strtol(i8*, i8** nocapture, i32) #1
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false"
-                  "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false"
-                   "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false"
-                  "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false"
-                  "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #2 = { nounwind }
-
-!0 = metadata !{metadata !"int", metadata !1}
-!1 = metadata !{metadata !"omnipotent char", metadata !2}
-!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-!3 = metadata !{}
-!4 = metadata !{metadata !"any pointer", metadata !1}