diff --git a/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh b/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh
index cde03bd6d0ffa9969c785e17fe2f708c75396158..33a54cd0de626113e5cf11e2f6a6928d4fa384eb 100644
--- a/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh
+++ b/llvm/projects/hpvm-tensor-rt/bin/install_runtime.sh
@@ -3,11 +3,9 @@
 export HPVM_TENSOR_RT_HOME=/home/hsharif3/Gitlab/hpvm/llvm/projects/hpvm-tensor-rt/
 export PATH=/home/hsharif3/Gitlab/hpvm/build/bin/:$PATH
 
-clang++ -emit-llvm -c ${HPVM_TENSOR_RT_HOME}/tensor_runtime/include/tensor_signatures.cc -o ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc
+clang++ -I"${CUDA_INCLUDE_PATH:-/software/cuda-9.1/include}" -emit-llvm -c "${HPVM_TENSOR_RT_HOME}/tensor_runtime/include/tensor_signatures.cc" -o "${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc"  # Emit runtime signature bitcode; set CUDA_INCLUDE_PATH to override the default CUDA header directory.
 llvm-dis --version
 llvm-dis ${HPVM_TENSOR_RT_HOME}/lib/tensor_runtime.bc
-cp ${HPVM_TENSOR_RT_HOME}/build/libtensor_runtime.a  ${HPVM_TENSOR_RT_HOME}/lib/libtensor_runtime.a
-cp ${HPVM_TENSOR_RT_HOME}/build_autotuner/libtensor_runtime.a  ${HPVM_TENSOR_RT_HOME}/lib/libtensor_autotuner.a
 
 
 
diff --git a/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll b/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll
index 3e48a094b89ac506cf50f712a0d60b1bac95f75d..89c8da90f8ab740062bd84cdd365baa67311a7a4 100644
--- a/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll
+++ b/llvm/projects/hpvm-tensor-rt/lib/tensor_runtime.ll
@@ -8,8 +8,8 @@ define void @_Z13dummyFunctionv() #0 {
 entry:
   %initRT = alloca i8*, align 8
   %cleanRT = alloca i8*, align 8
-  %initApproxhpvmRT = alloca i8*, align 8
-  %cleaApproxhpvmRT = alloca i8*, align 8
+  %initApproxRT = alloca i8*, align 8
+  %cleanApproxRT = alloca i8*, align 8
   %initRTController = alloca i8*, align 8
   %cleanRTController = alloca i8*, align 8
   %request_tensorPtr = alloca i8*, align 8
@@ -44,17 +44,18 @@ entry:
   %ConvLayer = alloca i8*, align 8
   %FCLayer = alloca i8*, align 8
   %ConvLayer2 = alloca i8*, align 8
+  %ConvLayer3 = alloca i8*, align 8
   %FCLayer2 = alloca i8*, align 8
   %AddWrapper = alloca i8*, align 8
   %ReluWrapper = alloca i8*, align 8
   %TanhWrapper = alloca i8*, align 8
   %BatchNormWrapper = alloca i8*, align 8
   %PoolingWrapper = alloca i8*, align 8
-  %SoftmaxWrapper = alloca i8*, align 8
+  %softmaxWrapper = alloca i8*, align 8
   store i8* bitcast (void (i32)* @llvm_hpvm_initTensorRt to i8*), i8** %initRT, align 8
   store i8* bitcast (void ()* @llvm_hpvm_cleanupTensorRt to i8*), i8** %cleanRT, align 8
-  store i8* bitcast (void (i32)* @llvm_hpvm_initApproxhpvmRt to i8*), i8** %initApproxhpvmRT, align 8
-  store i8* bitcast (void ()* @llvm_hpvm_cleanupApproxhpvmRt to i8*), i8** %cleaApproxhpvmRT, align 8
+  store i8* bitcast (void (i32)* @llvm_hpvm_initApproxhpvmRt to i8*), i8** %initApproxRT, align 8
+  store i8* bitcast (void ()* @llvm_hpvm_cleanupApproxhpvmRt to i8*), i8** %cleanApproxRT, align 8
   store i8* bitcast (void (i8*, i8*)* @llvm_hpvm_initializeRuntimeController to i8*), i8** %initRTController, align 8
   store i8* bitcast (void ()* @llvm_hpvm_clearRuntimeController to i8*), i8** %cleanRTController, align 8
   store i8* bitcast (void (i8*, i32)* @hpvm_request_tensor to i8*), i8** %request_tensorPtr, align 8
@@ -89,13 +90,14 @@ entry:
   store i8* bitcast (i8* (i8*, float, float, i8*, float, float, i8*, float, float, i32, i32, i32, i32, i32, i32, i32, float, float, i32)* @ConvLayer_PROMISE to i8*), i8** %ConvLayer, align 8
   store i8* bitcast (i8* (i8*, float, float, i8*, float, float, i8*, float, float, i32, float, float, i32)* @FCLayer_PROMISE to i8*), i8** %FCLayer, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, float, float)* @wrapper_ConvLayer to i8*), i8** %ConvLayer2, align 8
+  store i8* bitcast (i8* (i8*, i8*, i8*, i32, i32, i32, i32, i32, i32)* @wrapper_tensorGroupConvolution to i8*), i8** %ConvLayer3, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i32, float, float)* @wrapper_FCLayer to i8*), i8** %FCLayer2, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*)* @wrapper_tensorAdd to i8*), i8** %AddWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorRelu to i8*), i8** %ReluWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorTanh to i8*), i8** %TanhWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*, i8*, i8*, i8*, i8*, double)* @wrapper_tensorBatchNorm to i8*), i8** %BatchNormWrapper, align 8
   store i8* bitcast (i8* (i8*, i8*, i32, i32, i32, i32, i32, i32, i32)* @wrapper_tensorPooling to i8*), i8** %PoolingWrapper, align 8
-  store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorSoftmax to i8*), i8** %SoftmaxWrapper, align 8
+  store i8* bitcast (i8* (i8*, i8*)* @wrapper_tensorSoftmax to i8*), i8** %softmaxWrapper, align 8
   ret void
 }
 
@@ -175,6 +177,8 @@ declare i8* @FCLayer_PROMISE(i8*, float, float, i8*, float, float, i8*, float, f
 
 declare i8* @wrapper_ConvLayer(i8*, i8*, i8*, i8*, i32, i32, i32, i32, i32, i32, i32, float, float) #1
 
+declare i8* @wrapper_tensorGroupConvolution(i8*, i8*, i8*, i32, i32, i32, i32, i32, i32) #1
+
 declare i8* @wrapper_FCLayer(i8*, i8*, i8*, i8*, i32, float, float) #1
 
 declare i8* @wrapper_tensorAdd(i8*, i8*, i8*) #1