diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/Makefile b/llvm/test/VISC/gemm_opencl/matrixMul/Makefile
index 816e85da5fed3c3df732151a7a21aa0a84fc4563..8426617b1812c901bf315424b5f2d7b5d9f164d5 100644
--- a/llvm/test/VISC/gemm_opencl/matrixMul/Makefile
+++ b/llvm/test/VISC/gemm_opencl/matrixMul/Makefile
@@ -1,24 +1,30 @@
-PASSES := 
+PASSES :=
 
 .PHONY: clean
 
-LIBCLC:=/home/kotsifa2/llvm/libclc
-HOST:=sgemm
+LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
+LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc
+HOST:=gemm_opencl
 KERNELS:=matrixMul
+LLVM_CC:=$(LLVM_INSTALL)/bin/clang
+LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link
 
-all: $(KERNELS:%=%.ll) $(HOST:%=%.ll)
+all: $(KERNELS:%=%.nvptx.s) $(HOST:%=%.ll) $(HOST:%=%.bin)
 
 $(KERNELS:%=%.ll):%.ll:%.cl
-	clang -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
+	$(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
 
 $(KERNELS:%=%.linked.bc):%.linked.bc:%.ll
-	llvm-link $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
+	$(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
 
 $(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc
-	clang -target nvptx $< -S -o $@
+	$(LLVM_CC) -O3 -target nvptx $< -S -o $@
 
 $(HOST:%=%.ll):%.ll:%.c
-	clang -O3 -S -emit-llvm $< -o $@
+	$(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@
+
+$(HOST:%=%.bin):%.bin:%.c
+	$(LLVM_CC) -O3 -lOpenCL -I /usr/local/cuda/include $< -o $@
 
 clean :
-	rm -f *.ll *.bc *.s
+	rm -f *.ll *.bc *.s *.bin
diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c b/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c
index 9b83609cc0d4117f9b333e4ada759c4854cd17bb..89c535add07650ba36e26c45244a420f2ef2801f 100644
--- a/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c
+++ b/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c
@@ -1,7 +1,8 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#include <oclUtils.h>
+#include <string.h>
+#include <CL/cl.h>
 
 #define WA 1024
 #define HA 1024
@@ -10,10 +11,12 @@
 #define WC WB
 #define HC HA
 
+
+
 // Thread block size
 #define BLOCK_SIZE 16
 
-inline void checkErr(cl_int err, cl_int success, const char * name) {
+static inline void checkErr(cl_int err, cl_int success, const char * name) {
   if (err != success) {
     fprintf(stderr, "ERROR: %s\n", name);
     exit(EXIT_FAILURE);
@@ -25,13 +28,93 @@ void randomInit(float* data, int size) {
   for (int i = 0; i < size; ++i)
     data[i] = rand() / (float)RAND_MAX;
 }
- 
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program file.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param cFilename        program filename
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+char* LoadProgSource(const char* cFilename, size_t* szFinalLength)
+{
+    // locals
+    FILE* pFileStream = NULL;
+    size_t szSourceLength;
+
+    // open the OpenCL source code file
+    #ifdef _WIN32   // Windows version
+        if(fopen_s(&pFileStream, cFilename, "rb") != 0)
+        {
+            return NULL;
+        }
+    #else           // Linux version
+        pFileStream = fopen(cFilename, "rb");
+        if(pFileStream == 0)
+        {
+            return NULL;
+        }
+    #endif
+
+    // get the length of the source code
+    fseek(pFileStream, 0, SEEK_END);
+    szSourceLength = ftell(pFileStream);
+    fseek(pFileStream, 0, SEEK_SET);
+
+    // allocate a buffer for the source code string and read it in
+    char* cSourceString = (char *)malloc(szSourceLength + 1);
+    if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
+    {
+        fclose(pFileStream);
+        free(cSourceString);
+        return 0;
+    }
+
+    // close the file and return the total length of the combined (preamble + source) string
+    fclose(pFileStream);
+    if(szFinalLength != 0)
+    {
+        *szFinalLength = szSourceLength;
+    }
+    cSourceString[szSourceLength] = '\0';
+
+    return cSourceString;
+}
+
+// Check bool
+int isEqual(float a, float b) {
+  return (fabs(a-b) < 0.001);
+}
+
+// Check Results
+
+int checkResults(float* A, float* B, float* C) {
+  unsigned int size_A = WA * HA;
+  unsigned int size_B = WB * HB;
+  unsigned int size_C = WC * HC;
+  unsigned int bytesC = sizeof(float) * size_C;
+  float* goldC = (float*) malloc(bytesC);
+  for (int i=0; i < HC; i++) {
+    for (int j=0; j < WC; j++) {
+      goldC[i*WC + j] = 0;
+      for (int k=0; k < HB; k++) {
+        goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j];
+      }
+      if(!isEqual(goldC[i*WC + j], C[i*WC + j])) {
+        printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]);
+        return 0;
+      }
+    }
+  }
+  return 1; // Success
+}
+
 // Main
 int main(int argc, char** argv) {
 
   // seed for rand()
   srand(2006);
- 
+
   // Allocate host memory for matrices A and B
   unsigned int size_A = WA * HA;
   unsigned int bytes_A = sizeof(float) * size_A;
@@ -40,12 +123,12 @@ int main(int argc, char** argv) {
   unsigned int size_B = WB * HB;
   unsigned int bytes_B = sizeof(float) * size_B;
   float* h_B = (float*) malloc(bytes_B);
- 
+
    // Initialize host memory
    randomInit(h_A, size_A);
    randomInit(h_B, size_B);
 
-/* 
+/*
    // Print A and B
    printf("\n\nMatrix A\n");
    for(int i = 0; i < size_A; i++)
@@ -54,7 +137,7 @@ int main(int argc, char** argv) {
       if(((i + 1) % WA) == 0)
       printf("\n");
    }
- 
+
    printf("\n\nMatrix B\n");
    for(int i = 0; i < size_B; i++)
    {
@@ -68,7 +151,7 @@ int main(int argc, char** argv) {
   unsigned int size_C = WC * HC;
   unsigned int bytes_C = sizeof(float) * size_C;
   float* h_C = (float*) malloc(bytes_C);
- 
+
    // Initialize OpenCL
 
    // OpenCL specific variables
@@ -76,7 +159,7 @@ int main(int argc, char** argv) {
   cl_command_queue clCommandQue;
   cl_program clProgram;
   cl_kernel clKernel;
- 
+
   size_t dataBytes;
   size_t kernelLength;
   cl_int errcode;
@@ -89,47 +172,86 @@ int main(int argc, char** argv) {
   /*****************************************/
   /* Initialize OpenCL */
   /*****************************************/
-  clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU,
+  // query the number of platforms
+  cl_uint numPlatforms;
+  errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
+  checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
+
+  // now get all the platform IDs
+  cl_platform_id platforms[numPlatforms];
+  errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
+  checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
+
+  for(unsigned i=0; i < numPlatforms; i++) {
+    char buffer[10240];
+    printf("  -- %d --\n", i);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL);
+    printf("  PROFILE = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL);
+    printf("  VERSION = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL);
+    printf("  NAME = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL);
+    printf("  VENDOR = %s\n", buffer);
+    clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
+    printf("  EXTENSIONS = %s\n", buffer);
+  }
+  // set platform property - just pick the first one
+  cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
+                                        (int) platforms[0],
+                                        0};
+  clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
                                          NULL, NULL, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
+  checkErr(errcode, CL_SUCCESS, "Failure to create GPU context");
 
   // get the list of GPU devices associated with context
   errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0,
                               NULL, &dataBytes);
   cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
-  errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, 
+  errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes,
                               clDevices, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
+  checkErr(errcode, CL_SUCCESS, "Failure to get context info");
 
   //Create a command-queue
   clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
-  
+  checkErr(errcode, CL_SUCCESS, "Failure to create command queue");
+
   // Setup device memory
   d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL,
                        &errcode);
-  d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
+  d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                        bytes_A, h_A, &errcode);
-  d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, 
+  d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                        bytes_B, h_B, &errcode);
- 
- 
+
+
    // Load and build OpenCL kernel
-  char *clMatrixMul = oclLoadProgSource("kernel.cl",
-                                        "// My comment\n", 
-                                        &kernelLength);
-  shrCheckError(clMatrixMul != NULL, shrTRUE);
- 
-  clProgram = clCreateProgramWithSource(clGPUContext, 1,
-                                        (const char **)&clMatrixMul, 
+  /*char *clMatrixMul = LoadProgSource("matrixMul.cl",
+                                        "// My comment\n",
+                                        &kernelLength);*/
+  //checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program");
+
+  /*clProgram = clCreateProgramWithSource(clGPUContext, 1,
+                                        (const char **)&clMatrixMul,
                                         &kernelLength, &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
+  checkErr(errcode, CL_SUCCESS, "Failure to create program from source");
+*/
+  size_t binaryLength;
+  char *clMatrixMul = LoadProgSource("matrixMul.nvptx.s", &binaryLength);
+  checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program Binary");
+  
+  cl_int binaryStatus;
+  clProgram = clCreateProgramWithBinary(clGPUContext, 1, &clDevices[0],
+                                        &binaryLength,
+                                        (const unsigned char **)&clMatrixMul,
+                                        &binaryStatus, &errcode);
+  checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
 
   errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
- 
+  checkErr(errcode, CL_SUCCESS, "Failure to build program");
+
   clKernel = clCreateKernel(clProgram, "matrixMul", &errcode);
-  shrCheckError(errcode, CL_SUCCESS);
+  checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
 
 
   // Launch OpenCL kernel
@@ -142,7 +264,7 @@ int main(int argc, char** argv) {
   errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
   errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA);
   errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC);
-  shrCheckError(errcode, CL_SUCCESS);
+  checkErr(errcode, CL_SUCCESS, "Failure to set kernel arguments");
  
   localWorkSize[0] = BLOCK_SIZE;
   localWorkSize[1] = BLOCK_SIZE;
@@ -152,12 +274,12 @@ int main(int argc, char** argv) {
   errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, 
                                    globalWorkSize, localWorkSize,
                                    0, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
+  checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
  
   // Retrieve result from device
   errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, 
                                 h_C, 0, NULL, NULL);
-  shrCheckError(errcode, CL_SUCCESS);
+  checkErr(errcode, CL_SUCCESS, "Failure to read buffer");
  
   // Print out the result
 /*
@@ -169,6 +291,10 @@ int main(int argc, char** argv) {
   }
   printf("\n");
  */
+  if(checkResults(h_A, h_B, h_C))
+    printf("\nPass!\n");
+  else
+    printf("\nFailed!\n");
   printf("\nDone!\n");
 
   // Deallocate memory
diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/matrixMul.cl b/llvm/test/VISC/gemm_opencl/matrixMul/matrixMul.cl
index d3926a9f9f1bbc4d41e473ea8851ea3bc520e878..936d354542da6c36400903238600f698f0087138 100644
--- a/llvm/test/VISC/gemm_opencl/matrixMul/matrixMul.cl
+++ b/llvm/test/VISC/gemm_opencl/matrixMul/matrixMul.cl
@@ -18,7 +18,6 @@ __kernel void matrixMul(__global float* C,
   for (int i = 0; i < k; i++) {
     res += A[ty*k+i] * B[i*n+tx];
   }
-
   // Write in device memory
   C[ty*n+tx] = res;
 
diff --git a/llvm/test/VISC/gemm_opencl/matrixMul_bc/Makefile b/llvm/test/VISC/gemm_opencl/matrixMul_bc/Makefile
index f4c3d9b296b18baa1e60b58c705878e1a256a3cb..1984e14c78f3fabd3f2b92a98b81919dbaf2b979 100644
--- a/llvm/test/VISC/gemm_opencl/matrixMul_bc/Makefile
+++ b/llvm/test/VISC/gemm_opencl/matrixMul_bc/Makefile
@@ -1,24 +1,27 @@
-PASSES := 
+PASSES :=
 
 .PHONY: clean
 
-LIBCLC:=/home/kotsifa2/llvm/libclc
+LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
+LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc
 HOST:=sgemm
 KERNELS:=matrixMul_bc
+LLVM_CC:=$(LLVM_INSTALL)/bin/clang
+LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link
 
 all: $(KERNELS:%=%.ll) $(HOST:%=%.ll)
 
 $(KERNELS:%=%.ll):%.ll:%.cl
-	clang -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
+	$(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
 
 $(KERNELS:%=%.linked.bc):%.linked.bc:%.ll
-	llvm-link $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
+	$(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
 
 $(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc
-	clang -target nvptx $< -S -o $@
+	$(LLVM_CC) -target nvptx $< -S -o $@
 
 $(HOST:%=%.ll):%.ll:%.c
-	clang -O3 -S -emit-llvm $< -o $@
+	$(LLVM_CC) -O3 -S -emit-llvm $< -o $@
 
 clean :
 	rm -f *.ll *.bc *.s
diff --git a/llvm/test/VISC/gemm_opencl/matrixMul_bc/gemm_opencl.c b/llvm/test/VISC/gemm_opencl/matrixMul_bc/gemm_opencl.c
index 9b83609cc0d4117f9b333e4ada759c4854cd17bb..31cd7502ea360592ea845c9705de2568f35d6de9 100644
--- a/llvm/test/VISC/gemm_opencl/matrixMul_bc/gemm_opencl.c
+++ b/llvm/test/VISC/gemm_opencl/matrixMul_bc/gemm_opencl.c
@@ -1,7 +1,8 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
-#include <oclUtils.h>
+//#include <oclUtils.h>
+#include <CL/cl.h>
 
 #define WA 1024
 #define HA 1024
@@ -118,7 +119,7 @@ int main(int argc, char** argv) {
   char *clMatrixMul = oclLoadProgSource("kernel.cl",
                                         "// My comment\n", 
                                         &kernelLength);
-  shrCheckError(clMatrixMul != NULL, shrTRUE);
+  shrCheckError(clMatrixMul != NULL, CL_SUCCESS);
  
   clProgram = clCreateProgramWithSource(clGPUContext, 1,
                                         (const char **)&clMatrixMul,