diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/Makefile b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/Makefile
index f74ee8921a534b6963ba06d089398114571d070b..96acb24ed36b2cef49d3882580831d67e7659114 100644
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/Makefile
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/Makefile
@@ -6,3 +6,4 @@ VISC_OBJS=main.visc.ll
 APP_CUDALDFLAGS=-lm -lstdc++
 APP_CFLAGS=-ffast-math -O3
 APP_CXXFLAGS=-ffast-math -O3
+APP_OPTFLAGS=-unroll-threshold=250 -loop-unroll -scalarrepl
diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc
index 25f67c31f21e5eea68966fd69d56805852ad5f5b..6906f419524463786795073320a06060351686d1 100644
--- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc
+++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc
@@ -40,23 +40,27 @@ extern char* readFile(const char*);
   }
 
 typedef struct __attribute__((__packed__)) {
-  float *A; size_t bytesA;
-  int lda;
-  float *B; size_t bytesB;
-  int ldb;
-  float *C; size_t bytesC;
-  int ldc;
-  int k;
-  float alpha;
-  float beta;
-  int block_x;
-  int block_y;
-  int grid_x;
-  int grid_y;
-} RootIn;
+    float *A;
+    size_t bytesA;
+    int lda;
+    float *B;
+    size_t bytesB;
+    int ldb;
+    float *C;
+    size_t bytesC;
+    int ldc;
+    int k;
+    float alpha;
+    float beta;
+    int block_x;
+    int block_y;
+    int grid_x;
+    int grid_y;
+}
+RootIn;
 
 void packData(RootIn* args,
-              float *A, size_t bytesA, 
+              float *A, size_t bytesA,
               int lda,
               float *B, size_t bytesB,
               int ldb,
@@ -69,35 +73,35 @@ void packData(RootIn* args,
               int block_y,
               int grid_x,
               int grid_y) {
-  args->A = A;
-  args->bytesA = bytesA;
-  args->lda = lda;
-  args->B = B;
-  args->bytesB = bytesB;
-  args->ldb = ldb;
-  args->C = C;
-  args->bytesC = bytesC;
-  args->ldc = ldc;
-  args->k = k;
-  args->alpha = alpha;
-  args->beta = beta;
-  args->block_x = block_x;
-  args->block_y = block_y;
-  args->grid_x = grid_x;
-  args->grid_y = grid_y;
+    args->A = A;
+    args->bytesA = bytesA;
+    args->lda = lda;
+    args->B = B;
+    args->bytesB = bytesB;
+    args->ldb = ldb;
+    args->C = C;
+    args->bytesC = bytesC;
+    args->ldc = ldc;
+    args->k = k;
+    args->alpha = alpha;
+    args->beta = beta;
+    args->block_x = block_x;
+    args->block_y = block_y;
+    args->grid_x = grid_x;
+    args->grid_y = grid_y;
 }
 
 // TODO: decide between dynamic vs static allocation. Merely a convension - will
 // be translated.
 void Allocation(int block_x, int block_y) {
-  // Memory shared between threadblocks
-  //int bl_x = TILE_N;
-  //int bl_y = TILE_TB_HEIGHT;
-  //void* shB = __visc__malloc(bl_x*bl_y*sizeof(float));
-  void* shB = __visc__malloc(block_x*block_y*sizeof(float));
-  
-  //__visc__return(shB, bl_x*bl_y*sizeof(float)); 
-  __visc__return(shB, block_x*block_y*sizeof(float)); 
+    // Memory shared between threadblocks
+    //int bl_x = TILE_N;
+    //int bl_y = TILE_TB_HEIGHT;
+    //void* shB = __visc__malloc(bl_x*bl_y*sizeof(float));
+    void* shB = __visc__malloc(block_x*block_y*sizeof(float));
+
+    //__visc__return(shB, bl_x*bl_y*sizeof(float));
+    __visc__return(shB, block_x*block_y*sizeof(float));
 }
 
 
@@ -120,85 +124,36 @@ void SgemmLeaf( float* A, size_t bytesA, int lda, float* B, size_t bytesB, int l
 
     float c[TILE_N];
     for (int i=0; i < TILE_N; i++)
-      c[i] = 0.0f;
-    //float c0 = 0;
-    //float c1 = 0;
-    //float c2 = 0;
-    //float c3 = 0;
-    //float c4 = 0;
-    //float c5 = 0;
-    //float c6 = 0;
-    //float c7 = 0;
-    //float c8 = 0;
-    //float c9 = 0;
-    //float c10 = 0;
-    //float c11 = 0;
-    //float c12 = 0;
-    //float c13 = 0;
-    //float c14 = 0;
-    //float c15 = 0;
-   
+        c[i] = 0.0f;
+
     int mid = ly*dimx+lx;
     int m = gx * TILE_M + mid;
     int n = gy * TILE_N + lx;
 
     for (int i = 0; i < k; i+=TILE_TB_HEIGHT) {
-      float a; 
-      //shB[ly][lx] = B[n+(i+ly)*ldb];
-      shB[ly*dimx+lx] = B[n+(i+ly)*ldb];
-
-      __visc__barrier();
-      for (int j = 0; j < TILE_TB_HEIGHT; j++) {
-	      a = A[m + (i+j)*lda];
-              for (int kk = 0; kk < TILE_N; kk++) {
-                      //c[kk] += a * shB[j][kk];
-                      c[kk] += a * shB[j*dimx+kk];
-              }
-              //c0 += a * shB[j*dimx+0];
-              //c1 += a * shB[j*dimx+1];
-              //c2 += a * shB[j*dimx+2];
-              //c3 += a * shB[j*dimx+3];
-              //c4 += a * shB[j*dimx+4];
-              //c5 += a * shB[j*dimx+5];
-              //c6 += a * shB[j*dimx+6];
-              //c7 += a * shB[j*dimx+7];
-              //c8 += a * shB[j*dimx+8];
-              //c9 += a * shB[j*dimx+9];
-              //c10 += a * shB[j*dimx+10];
-              //c11 += a * shB[j*dimx+11];
-              //c12 += a * shB[j*dimx+12];
-              //c13 += a * shB[j*dimx+13];
-              //c14 += a * shB[j*dimx+14];
-              //c15 += a * shB[j*dimx+15];
-
-      }
-      __visc__barrier();
+        float a;
+        //shB[ly][lx] = B[n+(i+ly)*ldb];
+        shB[ly*dimx+lx] = B[n+(i+ly)*ldb];
+
+        __visc__barrier();
+        for (int j = 0; j < TILE_TB_HEIGHT; j++) {
+            a = A[m + (i+j)*lda];
+            for (int kk = 0; kk < TILE_N; kk++) {
+                //c[kk] += a * shB[j][kk];
+                c[kk] += a * shB[j*dimx+kk];
+            }
+        }
+        __visc__barrier();
     }
 
     int t = ldc * gy * TILE_N + m;
     for (int i = 0; i < TILE_N; i++) {
-      C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i];
+        C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i];
     }
-    //C[t+0*ldc] = C[t+0*ldc] * beta + alpha * c0;
-    //C[t+1*ldc] = C[t+1*ldc] * beta + alpha * c1;
-    //C[t+2*ldc] = C[t+2*ldc] * beta + alpha * c2;
-    //C[t+3*ldc] = C[t+3*ldc] * beta + alpha * c3;
-    //C[t+4*ldc] = C[t+4*ldc] * beta + alpha * c4;
-    //C[t+5*ldc] = C[t+5*ldc] * beta + alpha * c5;
-    //C[t+6*ldc] = C[t+6*ldc] * beta + alpha * c6;
-    //C[t+7*ldc] = C[t+7*ldc] * beta + alpha * c7;
-    //C[t+8*ldc] = C[t+8*ldc] * beta + alpha * c8;
-    //C[t+9*ldc] = C[t+9*ldc] * beta + alpha * c9;
-    //C[t+10*ldc] = C[t+10*ldc] * beta + alpha * c10;
-    //C[t+11*ldc] = C[t+11*ldc] * beta + alpha * c11;
-    //C[t+12*ldc] = C[t+12*ldc] * beta + alpha * c12;
-    //C[t+13*ldc] = C[t+13*ldc] * beta + alpha * c13;
-    //C[t+14*ldc] = C[t+14*ldc] * beta + alpha * c14;
-    //C[t+15*ldc] = C[t+15*ldc] * beta + alpha * c15;
 }
 
 // Thread block node for sgemm - Creates allocation node and leaf (thread) node
-void SgemmTB(float *A, size_t bytesA, 
+void SgemmTB(float *A, size_t bytesA,
              int lda,
              float *B, size_t bytesB,
              int ldb,
@@ -209,38 +164,38 @@ void SgemmTB(float *A, size_t bytesA,
              float beta,
              int block_x,
              int block_y) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
-  void* AllocationNode = __visc__createNode(Allocation);
-  void* SgemmLeafNode = __visc__createNode2D(SgemmLeaf, block_x, block_y);
-
-  // Bind edges
-  __visc__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A
-  __visc__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA
-  __visc__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda
-  __visc__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B
-  __visc__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB
-  __visc__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb
-  __visc__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C
-  __visc__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC
-  __visc__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc
-  __visc__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k
-  __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha
-  __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta
-
-  __visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x
-  __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y
-  
-
-  // Create Edges between AllocationNode and BFSLeafNodeNode
-  __visc__edge(AllocationNode, SgemmLeafNode, 0, 12, 0); // Edge local_B 
-  __visc__edge(AllocationNode, SgemmLeafNode, 1, 13, 0); // Edge bytes_local_B 
-
-  //TODO: bindOut : for now with out attribute
+    __visc__hint(visc::CPU_TARGET);
+    __visc__attributes(3, A, B, C, 1, C);
+    void* AllocationNode = __visc__createNode(Allocation);
+    void* SgemmLeafNode = __visc__createNode2D(SgemmLeaf, block_x, block_y);
+
+    // Bind edges
+    __visc__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A
+    __visc__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA
+    __visc__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda
+    __visc__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B
+    __visc__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB
+    __visc__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb
+    __visc__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C
+    __visc__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC
+    __visc__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc
+    __visc__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k
+    __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha
+    __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta
+
+    __visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x
+    __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y
+
+
+    // Create edges between AllocationNode and SgemmLeafNode
+    __visc__edge(AllocationNode, SgemmLeafNode, 0, 12, 0); // Edge local_B
+    __visc__edge(AllocationNode, SgemmLeafNode, 1, 13, 0); // Edge bytes_local_B
+
+    //TODO: bindOut : for now with out attribute
 }
 
 // Root node for sgemm - Creates thread block node
-void SgemmRoot(float *A, size_t bytesA, 
+void SgemmRoot(float *A, size_t bytesA,
                int lda,
                float *B, size_t bytesB,
                int ldb,
@@ -253,27 +208,27 @@ void SgemmRoot(float *A, size_t bytesA,
                int block_y,
                int grid_x,
                int grid_y) {
-  __visc__hint(visc::CPU_TARGET);
-  __visc__attributes(3, A, B, C, 1, C);
-  void* SgemmTBNode = __visc__createNode2D(SgemmTB, grid_x, grid_y);
-
-  // Bind edges
-  __visc__bindIn(SgemmTBNode, 0, 0, 0); // Bind A
-  __visc__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA
-  __visc__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda
-  __visc__bindIn(SgemmTBNode, 3, 3, 0); // Bind B
-  __visc__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB
-  __visc__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb
-  __visc__bindIn(SgemmTBNode, 6, 6, 0); // Bind C
-  __visc__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC
-  __visc__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc
-  __visc__bindIn(SgemmTBNode, 9, 9, 0); // Bind k
-  __visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha
-  __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta
-  __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x
-  __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y
-  
-  //TODO: bindOut : for now with out attribute
+    __visc__hint(visc::CPU_TARGET);
+    __visc__attributes(3, A, B, C, 1, C);
+    void* SgemmTBNode = __visc__createNode2D(SgemmTB, grid_x, grid_y);
+
+    // Bind edges
+    __visc__bindIn(SgemmTBNode, 0, 0, 0); // Bind A
+    __visc__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA
+    __visc__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda
+    __visc__bindIn(SgemmTBNode, 3, 3, 0); // Bind B
+    __visc__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB
+    __visc__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb
+    __visc__bindIn(SgemmTBNode, 6, 6, 0); // Bind C
+    __visc__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC
+    __visc__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc
+    __visc__bindIn(SgemmTBNode, 9, 9, 0); // Bind k
+    __visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha
+    __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta
+    __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x
+    __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y
+
+    //TODO: bindOut : for now with out attribute
 }
 
 // Creates root node for sgemm
@@ -291,8 +246,8 @@ __attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int
 
     // In this code we assume the matrix sizes are multiple of tile size
     if ((m%TILE_M) || (n%TILE_N)) {
-    std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
-              << "; n should be multiple of " << TILE_N << std::endl;
+        std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M
+                  << "; n should be multiple of " << TILE_N << std::endl;
         return;
     }
 
@@ -302,27 +257,27 @@ __attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int
 
 //    unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0);
 
-  int block_x = TILE_N;
-  int block_y = TILE_TB_HEIGHT;
-  int grid_x = m/TILE_M;
-  int grid_y = n/TILE_N;
-  // Pack data in struct
-  RootIn* args = (RootIn*) malloc(sizeof(RootIn));
-  packData(args, 
-          A, bytesA, 
-          lda,
-          B, bytesB,
-          ldb,
-          C, bytesC,
-          ldc,
-          k,
-          alpha,
-          beta,
-          block_x,
-          block_y,
-          grid_x,
-          grid_y
-          );
+    int block_x = TILE_N;
+    int block_y = TILE_TB_HEIGHT;
+    int grid_x = m/TILE_M;
+    int grid_y = n/TILE_N;
+    // Pack data in struct
+    RootIn* args = (RootIn*) malloc(sizeof(RootIn));
+    packData(args,
+             A, bytesA,
+             lda,
+             B, bytesB,
+             ldb,
+             C, bytesC,
+             ldc,
+             k,
+             alpha,
+             beta,
+             block_x,
+             block_y,
+             grid_x,
+             grid_y
+            );
 
     void* sgemmDFG = __visc__launch(0, SgemmRoot, (void*) args);
 
diff --git a/llvm/test/VISC/parboil/common/mk/visc.mk b/llvm/test/VISC/parboil/common/mk/visc.mk
index 6c71fa2b69086b581d60661fa036af525f05ddad..ce2d2c6ecb8293aaa6c5a2555727d9c892259d58 100644
--- a/llvm/test/VISC/parboil/common/mk/visc.mk
+++ b/llvm/test/VISC/parboil/common/mk/visc.mk
@@ -97,6 +97,7 @@ TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS))
 PARBOIL_OBJS = $(call INBUILDDIR,parboil.ll)
 KERNEL = $(TEST_OBJS).kernels.ll
 ifneq ($(TARGET),x86)
+  KERNEL_OPT = $(BUILDDIR)/$(APP).kernels.opt.ll
   KERNEL_LINKED = $(BUILDDIR)/$(APP).kernels.linked.ll
   #KERNEL = $(TEST_OBJS).kernels.ll
   PTX_ASSEMBLY = $(TEST_OBJS).nvptx.s
@@ -142,9 +143,12 @@ clean :
 $(PTX_ASSEMBLY) : $(KERNEL_LINKED)
 	$(CC) $(KERNEL_GEN_FLAGS) -S $< -o $@
 
-$(KERNEL_LINKED) : $(KERNEL)
+$(KERNEL_LINKED) : $(KERNEL_OPT)
 	$(LLVM_LINK) $(LIBCLC_NVPTX_LIB) -S $< -o $@
 
+$(KERNEL_OPT) : $(KERNEL)
+	$(OPT) $(APP_OPTFLAGS) -S $< -o $@
+
 $(BIN) : $(HOST_LINKED)
 	$(CXX) -O3 $(LDFLAGS) $< -o $@