diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll b/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll
index 36e772486444c3428b3cafb24b8fb0bd2b2ea4a8..4621c021f3738f5d91fd01414f30716b5e8f6b03 100644
--- a/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll
+++ b/llvm/test/VISC/gemm_opencl/matrixMul/visc_gemm_opencl.ll
@@ -17,16 +17,6 @@ target triple = "x86_64-unknown-linux-gnu"
 %struct._cl_kernel = type opaque
 %struct._cl_event = type opaque
 
-; Return Type of VISC Compute Matrix Mul
-%rtype = type {float*, i32}
-%struct.arg = type { float*, i32, float*, i32, float*, i32, i32, i32, i32, %rtype }
-
-; Function Attrs: nounwind
-declare i8* @llvm.visc.launch(i8*, i8*) #0
-
-; Function Attrs: nounwind
-declare void @llvm.visc.wait(i8*) #0
-
 @.str = private unnamed_addr constant [3 x i8] c"rb\00", align 1
 @.str1 = private unnamed_addr constant [45 x i8] c"Mismatch at %d,%d --- C = %f and goldC = %f\0A\00", align 1
 @.str2 = private unnamed_addr constant [35 x i8] c"Failure to get number of platforms\00", align 1
@@ -80,64 +70,6 @@ for.end:                                          ; preds = %for.body, %entry
 ; Function Attrs: nounwind
 declare i32 @rand() #1
 
-; Function Attrs: nounwind uwtable
-define noalias i8* @LoadProgSource(i8* nocapture %cFilename, i64* %szFinalLength) #0 {
-entry:
-  %call = tail call %struct._IO_FILE* @fopen(i8* %cFilename, i8* getelementptr inbounds ([3 x i8]* @.str, i64 0, i64 0)) #4
-  %cmp = icmp eq %struct._IO_FILE* %call, null
-  br i1 %cmp, label %return, label %if.end
-
-if.end:                                           ; preds = %entry
-  %call1 = tail call i32 @fseek(%struct._IO_FILE* %call, i64 0, i32 2) #4
-  %call2 = tail call i64 @ftell(%struct._IO_FILE* %call) #4
-  %call3 = tail call i32 @fseek(%struct._IO_FILE* %call, i64 0, i32 0) #4
-  %add = add i64 %call2, 1
-  %call4 = tail call noalias i8* @malloc(i64 %add) #4
-  %call5 = tail call i64 @fread(i8* %call4, i64 %call2, i64 1, %struct._IO_FILE* %call) #4
-  %cmp6 = icmp eq i64 %call5, 1
-  %call8 = tail call i32 @fclose(%struct._IO_FILE* %call) #4
-  br i1 %cmp6, label %if.end9, label %if.then7
-
-if.then7:                                         ; preds = %if.end
-  tail call void @free(i8* %call4) #4
-  br label %return
-
-if.end9:                                          ; preds = %if.end
-  %cmp11 = icmp eq i64* %szFinalLength, null
-  br i1 %cmp11, label %if.end13, label %if.then12
-
-if.then12:                                        ; preds = %if.end9
-  store i64 %call2, i64* %szFinalLength, align 8, !tbaa !3
-  br label %if.end13
-
-if.end13:                                         ; preds = %if.end9, %if.then12
-  %arrayidx = getelementptr inbounds i8* %call4, i64 %call2
-  store i8 0, i8* %arrayidx, align 1, !tbaa !1
-  br label %return
-
-return:                                           ; preds = %entry, %if.end13, %if.then7
-  %retval.0 = phi i8* [ null, %if.then7 ], [ %call4, %if.end13 ], [ null, %entry ]
-  ret i8* %retval.0
-}
-
-; Function Attrs: nounwind
-declare noalias %struct._IO_FILE* @fopen(i8* nocapture, i8* nocapture) #1
-
-; Function Attrs: nounwind
-declare i32 @fseek(%struct._IO_FILE* nocapture, i64, i32) #1
-
-; Function Attrs: nounwind
-declare i64 @ftell(%struct._IO_FILE* nocapture) #1
-
-; Function Attrs: nounwind
-declare noalias i8* @malloc(i64) #1
-
-; Function Attrs: nounwind
-declare i64 @fread(i8* nocapture, i64, i64, %struct._IO_FILE* nocapture) #1
-
-; Function Attrs: nounwind
-declare i32 @fclose(%struct._IO_FILE* nocapture) #1
-
 ; Function Attrs: nounwind
 declare void @free(i8* nocapture) #1
 
@@ -223,330 +155,76 @@ return:                                           ; preds = %for.inc50, %if.then
 ; Function Attrs: nounwind
 declare i32 @printf(i8* nocapture, ...) #1
 
-; Function Attrs: nounwind uwtable
-define void @computeMatrixMul(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C) #0 {
-entry:
-  %dataBytes = alloca i64, align 8
-  %errcode = alloca i32, align 4
-  %d_A = alloca %struct._cl_mem*, align 8
-  %d_B = alloca %struct._cl_mem*, align 8
-  %d_C = alloca %struct._cl_mem*, align 8
-  %numPlatforms = alloca i32, align 4
-  %buffer = alloca [10240 x i8], align 16
-  %properties = alloca [3 x i64], align 16
-  %binaryLength = alloca i64, align 8
-  %clMatrixMul = alloca i8*, align 8
-  %binaryStatus = alloca i32, align 4
-  %localWorkSize = alloca [2 x i64], align 16
-  %globalWorkSize = alloca [2 x i64], align 16
-  %wA = alloca i32, align 4
-  %wC = alloca i32, align 4
-  %call = call i32 @clGetPlatformIDs(i32 0, %struct._cl_platform_id** null, i32* %numPlatforms) #4
-  store i32 %call, i32* %errcode, align 4, !tbaa !4
-  %cmp.i = icmp eq i32 %call, 0
-  br i1 %cmp.i, label %checkErr.exit, label %if.then.i
-
-if.then.i:                                        ; preds = %entry
-  %0 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %0, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([35 x i8]* @.str2, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-checkErr.exit:                                    ; preds = %entry
-  %1 = load i32* %numPlatforms, align 4, !tbaa !4
-  %2 = zext i32 %1 to i64
-  %vla = alloca %struct._cl_platform_id*, i64 %2, align 16
-  %call1 = call i32 @clGetPlatformIDs(i32 %1, %struct._cl_platform_id** %vla, i32* null) #4
-  store i32 %call1, i32* %errcode, align 4, !tbaa !4
-  %cmp.i105 = icmp eq i32 %call1, 0
-  br i1 %cmp.i105, label %for.cond.preheader, label %if.then.i107
-
-for.cond.preheader:                               ; preds = %checkErr.exit
-  %3 = load i32* %numPlatforms, align 4, !tbaa !4
-  %cmp148 = icmp eq i32 %3, 0
-  br i1 %cmp148, label %for.end, label %for.body.lr.ph
-
-for.body.lr.ph:                                   ; preds = %for.cond.preheader
-  %4 = getelementptr inbounds [10240 x i8]* %buffer, i64 0, i64 0
-  br label %for.body
-
-if.then.i107:                                     ; preds = %checkErr.exit
-  %5 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i106 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %5, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([28 x i8]* @.str3, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-for.body:                                         ; preds = %for.body.lr.ph, %for.body
-  %i.0149 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
-  call void @llvm.lifetime.start(i64 10240, i8* %4) #4
-  %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @.str4, i64 0, i64 0), i32 %i.0149) #4
-  %idxprom = zext i32 %i.0149 to i64
-  %arrayidx = getelementptr inbounds %struct._cl_platform_id** %vla, i64 %idxprom
-  %6 = load %struct._cl_platform_id** %arrayidx, align 8, !tbaa !5
-  %call3 = call i32 @clGetPlatformInfo(%struct._cl_platform_id* %6, i32 2304, i64 10240, i8* %4, i64* null) #4
-  %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8]* @.str5, i64 0, i64 0), i8* %4) #4
-  %7 = load %struct._cl_platform_id** %arrayidx, align 8, !tbaa !5
-  %call9 = call i32 @clGetPlatformInfo(%struct._cl_platform_id* %7, i32 2305, i64 10240, i8* %4, i64* null) #4
-  %call11 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([16 x i8]* @.str6, i64 0, i64 0), i8* %4) #4
-  %8 = load %struct._cl_platform_id** %arrayidx, align 8, !tbaa !5
-  %call15 = call i32 @clGetPlatformInfo(%struct._cl_platform_id* %8, i32 2306, i64 10240, i8* %4, i64* null) #4
-  %call17 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([13 x i8]* @.str7, i64 0, i64 0), i8* %4) #4
-  %9 = load %struct._cl_platform_id** %arrayidx, align 8, !tbaa !5
-  %call21 = call i32 @clGetPlatformInfo(%struct._cl_platform_id* %9, i32 2307, i64 10240, i8* %4, i64* null) #4
-  %call23 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([15 x i8]* @.str8, i64 0, i64 0), i8* %4) #4
-  %10 = load %struct._cl_platform_id** %arrayidx, align 8, !tbaa !5
-  %call27 = call i32 @clGetPlatformInfo(%struct._cl_platform_id* %10, i32 2308, i64 10240, i8* %4, i64* null) #4
-  %call29 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([19 x i8]* @.str9, i64 0, i64 0), i8* %4) #4
-  call void @llvm.lifetime.end(i64 10240, i8* %4) #4
-  %inc = add i32 %i.0149, 1
-  %11 = load i32* %numPlatforms, align 4, !tbaa !4
-  %cmp = icmp ult i32 %inc, %11
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body, %for.cond.preheader
-  %arrayinit.begin = getelementptr inbounds [3 x i64]* %properties, i64 0, i64 0
-  store i64 4228, i64* %arrayinit.begin, align 16, !tbaa !3
-  %arrayinit.element = getelementptr inbounds [3 x i64]* %properties, i64 0, i64 1
-  %12 = load %struct._cl_platform_id** %vla, align 16, !tbaa !5
-  %13 = ptrtoint %struct._cl_platform_id* %12 to i64
-  %sext = shl i64 %13, 32
-  %conv = ashr exact i64 %sext, 32
-  store i64 %conv, i64* %arrayinit.element, align 8, !tbaa !3
-  %arrayinit.element31 = getelementptr inbounds [3 x i64]* %properties, i64 0, i64 2
-  store i64 0, i64* %arrayinit.element31, align 16, !tbaa !3
-  %call33 = call %struct._cl_context* @clCreateContextFromType(i64* %arrayinit.begin, i64 4, void (i8*, i8*, i64, i8*)* null, i8* null, i32* %errcode) #4
-  %14 = load i32* %errcode, align 4, !tbaa !4
-  %cmp.i109 = icmp eq i32 %14, 0
-  br i1 %cmp.i109, label %checkErr.exit112, label %if.then.i111
-
-if.then.i111:                                     ; preds = %for.end
-  %15 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i110 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %15, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([30 x i8]* @.str10, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-checkErr.exit112:                                 ; preds = %for.end
-  %call34 = call i32 @clGetContextInfo(%struct._cl_context* %call33, i32 4225, i64 0, i8* null, i64* %dataBytes) #4
-  store i32 %call34, i32* %errcode, align 4, !tbaa !4
-  %16 = load i64* %dataBytes, align 8, !tbaa !3
-  %call35 = call noalias i8* @malloc(i64 %16) #4
-  %17 = bitcast i8* %call35 to %struct._cl_device_id**
-  %call36 = call i32 @clGetContextInfo(%struct._cl_context* %call33, i32 4225, i64 %16, i8* %call35, i64* null) #4
-  %18 = load i32* %errcode, align 4, !tbaa !4
-  %or = or i32 %18, %call36
-  store i32 %or, i32* %errcode, align 4, !tbaa !4
-  %cmp.i113 = icmp eq i32 %or, 0
-  br i1 %cmp.i113, label %checkErr.exit116, label %if.then.i115
-
-if.then.i115:                                     ; preds = %checkErr.exit112
-  %19 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i114 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %19, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([28 x i8]* @.str11, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-checkErr.exit116:                                 ; preds = %checkErr.exit112
-  %20 = load %struct._cl_device_id** %17, align 8, !tbaa !5
-  %call38 = call %struct._cl_command_queue* @clCreateCommandQueue(%struct._cl_context* %call33, %struct._cl_device_id* %20, i64 0, i32* %errcode) #4
-  %21 = load i32* %errcode, align 4, !tbaa !4
-  %cmp.i117 = icmp eq i32 %21, 0
-  br i1 %cmp.i117, label %checkErr.exit120, label %if.then.i119
-
-if.then.i119:                                     ; preds = %checkErr.exit116
-  %22 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i118 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %22, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([32 x i8]* @.str12, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-checkErr.exit120:                                 ; preds = %checkErr.exit116
-  %conv39 = zext i32 %bytes_C to i64
-  %call40 = call %struct._cl_mem* @clCreateBuffer(%struct._cl_context* %call33, i64 1, i64 %conv39, i8* null, i32* %errcode) #4
-  store %struct._cl_mem* %call40, %struct._cl_mem** %d_C, align 8, !tbaa !5
-  %conv41 = zext i32 %bytes_A to i64
-  %23 = bitcast float* %h_A to i8*
-  %call42 = call %struct._cl_mem* @clCreateBuffer(%struct._cl_context* %call33, i64 33, i64 %conv41, i8* %23, i32* %errcode) #4
-  store %struct._cl_mem* %call42, %struct._cl_mem** %d_A, align 8, !tbaa !5
-  %conv43 = zext i32 %bytes_B to i64
-  %24 = bitcast float* %h_B to i8*
-  %call44 = call %struct._cl_mem* @clCreateBuffer(%struct._cl_context* %call33, i64 33, i64 %conv43, i8* %24, i32* %errcode) #4
-  store %struct._cl_mem* %call44, %struct._cl_mem** %d_B, align 8, !tbaa !5
-  %call45 = call i8* @LoadProgSource(i8* getelementptr inbounds ([18 x i8]* @.str13, i64 0, i64 0), i64* %binaryLength)
-  store i8* %call45, i8** %clMatrixMul, align 8, !tbaa !5
-  %cmp46 = icmp eq i8* %call45, null
-  br i1 %cmp46, label %if.then.i122, label %checkErr.exit123
-
-if.then.i122:                                     ; preds = %checkErr.exit120
-  %25 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i121 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %25, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([31 x i8]* @.str14, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-checkErr.exit123:                                 ; preds = %checkErr.exit120
-  %call49 = call %struct._cl_program* @clCreateProgramWithBinary(%struct._cl_context* %call33, i32 1, %struct._cl_device_id** %17, i64* %binaryLength, i8** %clMatrixMul, i32* %binaryStatus, i32* %errcode) #4
-  %26 = load i32* %errcode, align 4, !tbaa !4
-  %cmp.i124 = icmp eq i32 %26, 0
-  br i1 %cmp.i124, label %checkErr.exit127, label %if.then.i126
-
-if.then.i126:                                     ; preds = %checkErr.exit123
-  %27 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i125 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %27, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([38 x i8]* @.str15, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-checkErr.exit127:                                 ; preds = %checkErr.exit123
-  %call50 = call i32 @clBuildProgram(%struct._cl_program* %call49, i32 0, %struct._cl_device_id** null, i8* null, void (%struct._cl_program*, i8*)* null, i8* null) #4
-  store i32 %call50, i32* %errcode, align 4, !tbaa !4
-  %cmp.i128 = icmp eq i32 %call50, 0
-  br i1 %cmp.i128, label %checkErr.exit131, label %if.then.i130
-
-if.then.i130:                                     ; preds = %checkErr.exit127
-  %28 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i129 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %28, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([25 x i8]* @.str16, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-checkErr.exit131:                                 ; preds = %checkErr.exit127
-  %call51 = call %struct._cl_kernel* @clCreateKernel(%struct._cl_program* %call49, i8* getelementptr inbounds ([10 x i8]* @.str17, i64 0, i64 0), i32* %errcode) #4
-  %29 = load i32* %errcode, align 4, !tbaa !4
-  %cmp.i132 = icmp eq i32 %29, 0
-  br i1 %cmp.i132, label %checkErr.exit135, label %if.then.i134
-
-if.then.i134:                                     ; preds = %checkErr.exit131
-  %30 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i133 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %30, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([25 x i8]* @.str18, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-checkErr.exit135:                                 ; preds = %checkErr.exit131
-  store i32 1024, i32* %wA, align 4, !tbaa !4
-  store i32 1024, i32* %wC, align 4, !tbaa !4
-  %31 = bitcast %struct._cl_mem** %d_C to i8*
-  %call52 = call i32 @clSetKernelArg(%struct._cl_kernel* %call51, i32 0, i64 8, i8* %31) #4
-  store i32 %call52, i32* %errcode, align 4, !tbaa !4
-  %32 = bitcast %struct._cl_mem** %d_A to i8*
-  %call53 = call i32 @clSetKernelArg(%struct._cl_kernel* %call51, i32 1, i64 8, i8* %32) #4
-  %33 = load i32* %errcode, align 4, !tbaa !4
-  %or54 = or i32 %33, %call53
-  store i32 %or54, i32* %errcode, align 4, !tbaa !4
-  %34 = bitcast %struct._cl_mem** %d_B to i8*
-  %call55 = call i32 @clSetKernelArg(%struct._cl_kernel* %call51, i32 2, i64 8, i8* %34) #4
-  %35 = load i32* %errcode, align 4, !tbaa !4
-  %or56 = or i32 %35, %call55
-  store i32 %or56, i32* %errcode, align 4, !tbaa !4
-  %36 = bitcast i32* %wA to i8*
-  %call57 = call i32 @clSetKernelArg(%struct._cl_kernel* %call51, i32 3, i64 4, i8* %36) #4
-  %37 = load i32* %errcode, align 4, !tbaa !4
-  %or58 = or i32 %37, %call57
-  store i32 %or58, i32* %errcode, align 4, !tbaa !4
-  %38 = bitcast i32* %wC to i8*
-  %call59 = call i32 @clSetKernelArg(%struct._cl_kernel* %call51, i32 4, i64 4, i8* %38) #4
-  %39 = load i32* %errcode, align 4, !tbaa !4
-  %or60 = or i32 %39, %call59
-  store i32 %or60, i32* %errcode, align 4, !tbaa !4
-  %cmp.i136 = icmp eq i32 %or60, 0
-  br i1 %cmp.i136, label %checkErr.exit139, label %if.then.i138
-
-if.then.i138:                                     ; preds = %checkErr.exit135
-  %40 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i137 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %40, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([32 x i8]* @.str19, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-checkErr.exit139:                                 ; preds = %checkErr.exit135
-  %arrayidx61 = getelementptr inbounds [2 x i64]* %localWorkSize, i64 0, i64 0
-  store i64 16, i64* %arrayidx61, align 16, !tbaa !3
-  %arrayidx62 = getelementptr inbounds [2 x i64]* %localWorkSize, i64 0, i64 1
-  store i64 16, i64* %arrayidx62, align 8, !tbaa !3
-  %arrayidx63 = getelementptr inbounds [2 x i64]* %globalWorkSize, i64 0, i64 0
-  store i64 1024, i64* %arrayidx63, align 16, !tbaa !3
-  %arrayidx64 = getelementptr inbounds [2 x i64]* %globalWorkSize, i64 0, i64 1
-  store i64 1024, i64* %arrayidx64, align 8, !tbaa !3
-  %call67 = call i32 @clEnqueueNDRangeKernel(%struct._cl_command_queue* %call38, %struct._cl_kernel* %call51, i32 2, i64* null, i64* %arrayidx63, i64* %arrayidx61, i32 0, %struct._cl_event** null, %struct._cl_event** null) #4
-  store i32 %call67, i32* %errcode, align 4, !tbaa !4
-  %cmp.i140 = icmp eq i32 %call67, 0
-  br i1 %cmp.i140, label %checkErr.exit143, label %if.then.i142
-
-if.then.i142:                                     ; preds = %checkErr.exit139
-  %41 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i141 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %41, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([26 x i8]* @.str20, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-checkErr.exit143:                                 ; preds = %checkErr.exit139
-  %42 = load %struct._cl_mem** %d_C, align 8, !tbaa !5
-  %43 = bitcast float* %h_C to i8*
-  %call69 = call i32 @clEnqueueReadBuffer(%struct._cl_command_queue* %call38, %struct._cl_mem* %42, i32 1, i64 0, i64 %conv39, i8* %43, i32 0, %struct._cl_event** null, %struct._cl_event** null) #4
-  store i32 %call69, i32* %errcode, align 4, !tbaa !4
-  %cmp.i144 = icmp eq i32 %call69, 0
-  br i1 %cmp.i144, label %checkErr.exit147, label %if.then.i146
-
-if.then.i146:                                     ; preds = %checkErr.exit143
-  %44 = load %struct._IO_FILE** @stderr, align 8, !tbaa !5
-  %call.i145 = call i32 (%struct._IO_FILE*, i8*, ...)* @fprintf(%struct._IO_FILE* %44, i8* getelementptr inbounds ([11 x i8]* @.str25, i64 0, i64 0), i8* getelementptr inbounds ([23 x i8]* @.str21, i64 0, i64 0)) #4
-  call void @exit(i32 1) #7
-  unreachable
-
-checkErr.exit147:                                 ; preds = %checkErr.exit143
-  %45 = load %struct._cl_mem** %d_A, align 8, !tbaa !5
-  %call70 = call i32 @clReleaseMemObject(%struct._cl_mem* %45) #4
-  %46 = load %struct._cl_mem** %d_C, align 8, !tbaa !5
-  %call71 = call i32 @clReleaseMemObject(%struct._cl_mem* %46) #4
-  %47 = load %struct._cl_mem** %d_B, align 8, !tbaa !5
-  %call72 = call i32 @clReleaseMemObject(%struct._cl_mem* %47) #4
-  call void @free(i8* %call35) #4
-  %48 = load i8** %clMatrixMul, align 8, !tbaa !5
-  call void @free(i8* %48) #4
-  %call73 = call i32 @clReleaseContext(%struct._cl_context* %call33) #4
-  %call74 = call i32 @clReleaseKernel(%struct._cl_kernel* %call51) #4
-  %call75 = call i32 @clReleaseProgram(%struct._cl_program* %call49) #4
-  %call76 = call i32 @clReleaseCommandQueue(%struct._cl_command_queue* %call38) #4
-  ret void
-}
-
-declare i32 @clGetPlatformIDs(i32, %struct._cl_platform_id**, i32*) #3
-
 ; Function Attrs: nounwind
 declare void @llvm.lifetime.start(i64, i8* nocapture) #4
 
-declare i32 @clGetPlatformInfo(%struct._cl_platform_id*, i32, i64, i8*, i64*) #3
-
 ; Function Attrs: nounwind
 declare void @llvm.lifetime.end(i64, i8* nocapture) #4
 
-declare %struct._cl_context* @clCreateContextFromType(i64*, i64, void (i8*, i8*, i64, i8*)*, i8*, i32*) #3
+; --------------- VISC Intrinsics ---------------
+; Return Type of VISC Compute Matrix Mul
+%rtype = type {float*, i32}
+%struct.arg = type { float*, i32, float*, i32, float*, i32, i32, i32, i32, %rtype }
 
-declare i32 @clGetContextInfo(%struct._cl_context*, i32, i64, i8*, i64*) #3
+; Function Attrs: nounwind
+declare i8* @llvm.visc.launch(i8*, i8*) #0
 
-declare %struct._cl_command_queue* @clCreateCommandQueue(%struct._cl_context*, %struct._cl_device_id*, i64, i32*) #3
+; Function Attrs: nounwind
+declare void @llvm.visc.wait(i8*) #0
 
-declare %struct._cl_mem* @clCreateBuffer(%struct._cl_context*, i64, i64, i8*, i32*) #3
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode(i8*) #0
 
-declare %struct._cl_program* @clCreateProgramWithBinary(%struct._cl_context*, i32, %struct._cl_device_id**, i64*, i8**, i32*, i32*) #3
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode1D(i8*, i32) #0
 
-declare i32 @clBuildProgram(%struct._cl_program*, i32, %struct._cl_device_id**, i8*, void (%struct._cl_program*, i8*)*, i8*) #3
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode2D(i8*, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createNode3D(i8*, i32, i32, i32) #0
+
+; Function Attrs: nounwind
+declare i8* @llvm.visc.createEdge(i8*, i8*, i1, i32, i32) #0
 
-declare %struct._cl_kernel* @clCreateKernel(%struct._cl_program*, i8*, i32*) #3
+; Function Attrs: nounwind
+declare i8* @llvm.visc.getNode() #0
 
-declare i32 @clSetKernelArg(%struct._cl_kernel*, i32, i64, i8*) #3
+; Function Attrs: nounwind
+declare i8* @llvm.visc.getParentNode(i8*) #0
 
-declare i32 @clEnqueueNDRangeKernel(%struct._cl_command_queue*, %struct._cl_kernel*, i32, i64*, i64*, i64*, i32, %struct._cl_event**, %struct._cl_event**) #3
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNumDims(i8*) #0
 
-declare i32 @clEnqueueReadBuffer(%struct._cl_command_queue*, %struct._cl_mem*, i32, i64, i64, i8*, i32, %struct._cl_event**, %struct._cl_event**) #3
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNumNodeInstances.x(i8*) #0
 
-declare i32 @clReleaseMemObject(%struct._cl_mem*) #3
+; Function Attrs: nounwind
+declare i32 @llvm.visc.getNumNodeInstances.y(i8*) #0
 
-declare i32 @clReleaseContext(%struct._cl_context*) #3
+; Function Attrs: nounwind
+declare void @llvm.visc.bind.input(i8*, i32, i32)
 
-declare i32 @clReleaseKernel(%struct._cl_kernel*) #3
+; Function Attrs: nounwind
+declare void @llvm.visc.bind.output(i8*, i32, i32)
+; ----------------- VISC intrinsics end ------------------
 
-declare i32 @clReleaseProgram(%struct._cl_program*) #3
 
-declare i32 @clReleaseCommandQueue(%struct._cl_command_queue*) #3
 
 ; Function Attrs: nounwind uwtable
 define %rtype @matrixMul(float* nocapture %A, i32 %bytes_A, float* nocapture %B, i32 %bytes_B, float* %C, i32 %bytes_C, i32 %k, i32 %n, i32 %m) #0 {
 entry:
-  %call = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #2
-  %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #2
+  ; ------------------------- VISC changes ------------------
+  ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x
+  ; Replaced statement -- %call = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 0) #2
+  %this_node = call i8* @llvm.visc.getNode()
+  %call = tail call i32 @llvm.visc.getNumNodeInstances.x(i8* %this_node)
+
+  ; Replace get_global_id calls with calls to getNode followed but getNumNodeInstances.x
+  ; Replaced statement -- %call1 = tail call i32 (i32, ...)* bitcast (i32 (...)* @get_global_id to i32 (i32, ...)*)(i32 1) #2
+  %call1 = tail call i32 @llvm.visc.getNumNodeInstances.y(i8* %this_node)
+  ; ---------------------- VISC changes End ------------------
+  
   %cmp22 = icmp sgt i32 %k, 0
   br i1 %cmp22, label %for.body.lr.ph, label %for.end
 
@@ -581,9 +259,9 @@ for.end:                                          ; preds = %for.body, %entry
   %idxprom10 = sext i32 %add9 to i64
   %arrayidx11 = getelementptr inbounds float* %C, i64 %idxprom10
   store float %res.0.lcssa, float* %arrayidx11, align 4, !tbaa !0
-  %.fca.0.insert = insertvalue { float*, i32 } undef, float* %C, 0
-  %.fca.1.insert = insertvalue { float*, i32 } %.fca.0.insert, i32 %bytes_C, 1
-  ret { float*, i32 } %.fca.1.insert
+  %.fca.0.insert = insertvalue %rtype undef, float* %C, 0
+  %.fca.1.insert = insertvalue %rtype %.fca.0.insert, i32 %bytes_C, 1
+  ret %rtype %.fca.1.insert
 }
 
 define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_B, float* %h_C, i32 %bytes_C, i32 %WA, i32 %WB, i32 %HA) {
@@ -604,6 +282,9 @@ define %rtype @MatrixMulRoot(float* %h_A, i32 %bytes_A, float* %h_B, i32 %bytes_
   ret %rtype zeroinitializer
 }
 
+; Function Attrs: nounwind
+declare noalias i8* @malloc(i64) #1
+
 ; Function Attrs: nounwind uwtable
 define i32 @main(i32 %argc, i8** nocapture %argv) #0 {
 entry:
@@ -679,7 +360,7 @@ randomInit.exit41:                                ; preds = %for.body.i40
   call void @llvm.visc.wait(i8* %graphID)
 
   ; Get the result
-  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 4
+  %out.addr = getelementptr %struct.arg* %in.addr, i32 0, i32 9
   %out = load %rtype* %out.addr
   %out.h_C = extractvalue %rtype %out, 0
   ;%2 = extractvalue %rtype %out, 0