From 25f2cc89a3444648bcadd26058d0d2cd4654039b Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava <prakalps@gmail.com>
Date: Sat, 13 Jun 2015 19:40:16 -0500
Subject: [PATCH] Modified lbm to use precompiled ptx binary

---
 .../lbm/src/opencl_nvidia/kernel_offline.cl   | 176 ++++++++++
 .../src/opencl_nvidia/kernel_offline.nvptx.s  | 261 ++++++++++++++
 .../benchmarks/lbm/src/opencl_nvidia/main.c   | 318 +++++++++---------
 3 files changed, 597 insertions(+), 158 deletions(-)
 create mode 100644 llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl
 create mode 100644 llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.nvptx.s

diff --git a/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl
new file mode 100644
index 0000000000..3f34ea5ef2
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.cl
@@ -0,0 +1,176 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef LBM_KERNEL_CL
+#define LBM_KERNEL_CL
+
+#include "layout_config.h"
+#include "lbm_macros.h"
+/******************************************************************************/
+
+__kernel void performStreamCollide_kernel( __global float* srcGrid, __global float* dstGrid )
+{
+	srcGrid += MARGIN;
+	dstGrid += MARGIN;
+
+
+	//Using some predefined macros here.  Consider this the declaration 
+        //  and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z
+
+        SWEEP_VAR
+	SWEEP_X = get_local_id(0);
+	SWEEP_Y = get_group_id(0);
+	SWEEP_Z = get_group_id(1);
+	
+	float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB;
+	float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST ;
+	float tempSB, tempET, tempEB, tempWT, tempWB ;
+
+	//Load all of the input fields
+	//This is a gather operation of the SCATTER preprocessor variable
+        // is undefined in layout_config.h, or a "local" read otherwise
+	tempC = SRC_C(srcGrid);
+
+	tempN = SRC_N(srcGrid);
+	tempS = SRC_S(srcGrid);
+	tempE = SRC_E(srcGrid);
+	tempW = SRC_W(srcGrid);
+	tempT = SRC_T(srcGrid);
+	tempB = SRC_B(srcGrid);
+
+	tempNE = SRC_NE(srcGrid);
+	tempNW = SRC_NW(srcGrid);
+	tempSE = SRC_SE(srcGrid);
+	tempSW = SRC_SW(srcGrid);
+	tempNT = SRC_NT(srcGrid);
+	tempNB = SRC_NB(srcGrid);
+	tempST = SRC_ST(srcGrid);
+	tempSB = SRC_SB(srcGrid);
+	tempET = SRC_ET(srcGrid);
+	tempEB = SRC_EB(srcGrid);
+	tempWT = SRC_WT(srcGrid);
+	tempWB = SRC_WB(srcGrid);
+
+	//Test whether the cell is fluid or obstacle
+	if(as_uint(LOCAL(srcGrid,FLAGS)) & (OBSTACLE)) {
+		
+		//Swizzle the inputs: reflect any fluid coming into this cell 
+		// back to where it came from
+		temp_swp = tempN ; tempN = tempS ; tempS = temp_swp ;
+		temp_swp = tempE ; tempE = tempW ; tempW = temp_swp;
+		temp_swp = tempT ; tempT = tempB ; tempB = temp_swp;
+		temp_swp = tempNE; tempNE = tempSW ; tempSW = temp_swp;
+		temp_swp = tempNW; tempNW = tempSE ; tempSE = temp_swp;
+		temp_swp = tempNT ; tempNT = tempSB ; tempSB = temp_swp; 
+		temp_swp = tempNB ; tempNB = tempST ; tempST = temp_swp;
+		temp_swp = tempET ; tempET= tempWB ; tempWB = temp_swp;
+		temp_swp = tempEB ; tempEB = tempWT ; tempWT = temp_swp;
+	}
+	else {
+ 
+                //The math meat of LBM: ignore for optimization
+	        float ux, uy, uz, rho, u2;
+		float temp1, temp2, temp_base;
+		rho = tempC + tempN
+			+ tempS + tempE
+			+ tempW + tempT
+			+ tempB + tempNE
+			+ tempNW + tempSE
+			+ tempSW + tempNT
+			+ tempNB + tempST
+			+ tempSB + tempET
+			+ tempEB + tempWT
+			+ tempWB;
+
+		ux = + tempE - tempW
+			+ tempNE - tempNW
+			+ tempSE - tempSW
+			+ tempET + tempEB
+			- tempWT - tempWB;
+
+		uy = + tempN - tempS
+			+ tempNE + tempNW
+			- tempSE - tempSW
+			+ tempNT + tempNB
+			- tempST - tempSB;
+
+		uz = + tempT - tempB
+			+ tempNT - tempNB
+			+ tempST - tempSB
+			+ tempET - tempEB
+			+ tempWT - tempWB;		
+		
+		ux /= rho;
+		uy /= rho;
+		uz /= rho;
+
+		if(as_uint(LOCAL(srcGrid,FLAGS)) & (ACCEL)) {
+
+			ux = 0.005f;
+			uy = 0.002f;
+			uz = 0.000f;
+		}
+
+		u2 = 1.5f * (ux*ux + uy*uy + uz*uz) - 1.0f;
+		temp_base = OMEGA*rho;
+		temp1 = DFL1*temp_base;
+
+		//Put the output values for this cell in the shared memory
+		temp_base = OMEGA*rho;
+		temp1 = DFL1*temp_base;
+		temp2 = 1.0f-OMEGA;
+		tempC = temp2*tempC + temp1*(                                 - u2);
+	        temp1 = DFL2*temp_base;	
+		tempN = temp2*tempN + temp1*(       uy*(4.5f*uy       + 3.0f) - u2);
+		tempS = temp2*tempS + temp1*(       uy*(4.5f*uy       - 3.0f) - u2);
+		tempT = temp2*tempT + temp1*(       uz*(4.5f*uz       + 3.0f) - u2);
+		tempB = temp2*tempB + temp1*(       uz*(4.5f*uz       - 3.0f) - u2);
+		tempE = temp2*tempE + temp1*(       ux*(4.5f*ux       + 3.0f) - u2);
+		tempW = temp2*tempW + temp1*(       ux*(4.5f*ux       - 3.0f) - u2);
+		temp1 = DFL3*temp_base;
+		tempNT= temp2*tempNT + temp1 *( (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2);
+		tempNB= temp2*tempNB + temp1 *( (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2);
+		tempST= temp2*tempST + temp1 *( (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2);
+		tempSB= temp2*tempSB + temp1 *( (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2);
+		tempNE = temp2*tempNE + temp1 *( (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2);
+		tempSE = temp2*tempSE + temp1 *((+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2);
+		tempET = temp2*tempET + temp1 *( (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2);
+		tempEB = temp2*tempEB + temp1 *( (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2);
+		tempNW = temp2*tempNW + temp1 *( (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2);
+		tempSW = temp2*tempSW + temp1 *( (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2);
+		tempWT = temp2*tempWT + temp1 *( (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2);
+		tempWB = temp2*tempWB + temp1 *( (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2);
+	}
+
+	//Write the results computed above
+	//This is a scatter operation of the SCATTER preprocessor variable
+        // is defined in layout_config.h, or a "local" write otherwise
+	DST_C ( dstGrid ) = tempC;
+
+	DST_N ( dstGrid ) = tempN; 
+	DST_S ( dstGrid ) = tempS;
+	DST_E ( dstGrid ) = tempE;
+	DST_W ( dstGrid ) = tempW;
+	DST_T ( dstGrid ) = tempT;
+	DST_B ( dstGrid ) = tempB;
+
+	DST_NE( dstGrid ) = tempNE;
+	DST_NW( dstGrid ) = tempNW;
+	DST_SE( dstGrid ) = tempSE;
+	DST_SW( dstGrid ) = tempSW;
+	DST_NT( dstGrid ) = tempNT;
+	DST_NB( dstGrid ) = tempNB;
+	DST_ST( dstGrid ) = tempST;
+	DST_SB( dstGrid ) = tempSB;
+	DST_ET( dstGrid ) = tempET;
+	DST_EB( dstGrid ) = tempEB;
+	DST_WT( dstGrid ) = tempWT;
+	DST_WB( dstGrid ) = tempWB;
+}
+
+#endif // LBM_KERNEL_CL
diff --git a/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.nvptx.s b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.nvptx.s
new file mode 100644
index 0000000000..7e130799aa
--- /dev/null
+++ b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/kernel_offline.nvptx.s
@@ -0,0 +1,261 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 3.1
+.target sm_20, texmode_independent
+.address_size 32
+
+	// .globl	performStreamCollide_kernel
+
+.entry performStreamCollide_kernel(
+	.param .u32 .ptr .global .align 4 performStreamCollide_kernel_param_0,
+	.param .u32 .ptr .global .align 4 performStreamCollide_kernel_param_1
+)
+{
+	.reg .pred 	%p<3>;
+	.reg .f32 	%f<197>;
+	.reg .s32 	%r<38>;
+
+	ld.param.u32 	%r5, [performStreamCollide_kernel_param_0];
+	mov.u32	%r6, %tid.x;
+	ld.param.u32 	%r4, [performStreamCollide_kernel_param_1];
+	mov.u32	%r7, %ctaid.x;
+	mov.u32	%r8, %ctaid.y;
+	shl.b32 	%r9, %r7, 7;
+	add.s32 	%r10, %r9, %r6;
+	mad.lo.s32 	%r1, %r8, 15360, %r10;
+	add.s32 	%r2, %r1, 30720;
+	shl.b32 	%r11, %r2, 2;
+	add.s32 	%r12, %r5, %r11;
+	ld.global.f32 	%f1, [%r12];
+	shl.b32 	%r13, %r1, 2;
+	add.s32 	%r14, %r13, %r5;
+	ld.global.f32 	%f2, [%r14+9584128];
+	ld.global.f32 	%f3, [%r14+19046912];
+	add.s32 	%r15, %r10, -1;
+	mad.lo.s32 	%r16, %r8, 15360, %r15;
+	shl.b32 	%r17, %r16, 2;
+	add.s32 	%r18, %r17, %r5;
+	ld.global.f32 	%f4, [%r18+28508160];
+	add.s32 	%r19, %r10, 1;
+	mad.lo.s32 	%r20, %r8, 15360, %r19;
+	shl.b32 	%r21, %r20, 2;
+	add.s32 	%r22, %r21, %r5;
+	ld.global.f32 	%f5, [%r22+37969920];
+	ld.global.f32 	%f6, [%r14+47370240];
+	mad.lo.s32 	%r23, %r8, 15360, 15360;
+	add.s32 	%r24, %r10, %r23;
+	shl.b32 	%r25, %r24, 2;
+	add.s32 	%r26, %r25, %r5;
+	ld.global.f32 	%f7, [%r26+56893440];
+	ld.global.f32 	%f8, [%r14+66354684];
+	ld.global.f32 	%f9, [%r14+75816452];
+	ld.global.f32 	%f10, [%r14+85279228];
+	ld.global.f32 	%f11, [%r14+94740996];
+	ld.global.f32 	%f12, [%r14+104140288];
+	ld.global.f32 	%f13, [%r14+113724928];
+	ld.global.f32 	%f14, [%r14+123064832];
+	ld.global.f32 	%f15, [%r14+132649472];
+	ld.global.f32 	%f16, [%r14+141987836];
+	add.s32 	%r27, %r15, %r23;
+	shl.b32 	%r28, %r27, 2;
+	add.s32 	%r29, %r28, %r5;
+	ld.global.f32 	%f17, [%r29+151511040];
+	ld.global.f32 	%f18, [%r14+160911364];
+	add.s32 	%r30, %r19, %r23;
+	shl.b32 	%r31, %r30, 2;
+	add.s32 	%r32, %r31, %r5;
+	ld.global.f32 	%f19, [%r32+170434560];
+	ld.global.u32 	%r3, [%r14+179896320];
+	{
+	.reg .b32 temp;
+	and.b32	 temp, %r3, 1;
+	setp.b32.eq 	 %p1, temp, 1;
+	}
+	@%p1 bra 	BB0_2;
+	add.f32 	%f58, %f1, %f2;
+	add.f32 	%f59, %f58, %f3;
+	add.f32 	%f60, %f59, %f4;
+	add.f32 	%f61, %f60, %f5;
+	add.f32 	%f62, %f61, %f6;
+	add.f32 	%f63, %f62, %f7;
+	add.f32 	%f64, %f63, %f8;
+	add.f32 	%f65, %f64, %f9;
+	add.f32 	%f66, %f65, %f10;
+	add.f32 	%f67, %f66, %f11;
+	add.f32 	%f68, %f67, %f12;
+	add.f32 	%f69, %f68, %f13;
+	add.f32 	%f70, %f69, %f14;
+	add.f32 	%f71, %f70, %f15;
+	add.f32 	%f72, %f71, %f16;
+	add.f32 	%f73, %f72, %f17;
+	add.f32 	%f74, %f73, %f18;
+	add.f32 	%f75, %f74, %f19;
+	sub.f32 	%f76, %f4, %f5;
+	add.f32 	%f77, %f76, %f8;
+	sub.f32 	%f78, %f77, %f9;
+	add.f32 	%f79, %f78, %f10;
+	sub.f32 	%f80, %f79, %f11;
+	add.f32 	%f81, %f80, %f16;
+	add.f32 	%f82, %f81, %f17;
+	sub.f32 	%f83, %f82, %f18;
+	sub.f32 	%f84, %f83, %f19;
+	sub.f32 	%f85, %f2, %f3;
+	add.f32 	%f86, %f85, %f8;
+	add.f32 	%f87, %f86, %f9;
+	sub.f32 	%f88, %f87, %f10;
+	sub.f32 	%f89, %f88, %f11;
+	add.f32 	%f90, %f89, %f12;
+	add.f32 	%f91, %f90, %f13;
+	sub.f32 	%f92, %f91, %f14;
+	sub.f32 	%f93, %f92, %f15;
+	sub.f32 	%f94, %f6, %f7;
+	add.f32 	%f95, %f94, %f12;
+	sub.f32 	%f96, %f95, %f13;
+	add.f32 	%f97, %f96, %f14;
+	sub.f32 	%f98, %f97, %f15;
+	add.f32 	%f99, %f98, %f16;
+	sub.f32 	%f100, %f99, %f17;
+	add.f32 	%f101, %f100, %f18;
+	sub.f32 	%f102, %f101, %f19;
+	div.rn.f32 	%f103, %f84, %f75;
+	div.rn.f32 	%f104, %f93, %f75;
+	div.rn.f32 	%f105, %f102, %f75;
+	and.b32  	%r33, %r3, 2;
+	setp.eq.s32 	%p2, %r33, 0;
+	selp.f32 	%f106, %f103, 0f3BA3D70A, %p2;
+	selp.f32 	%f107, %f104, 0f3B03126F, %p2;
+	selp.f32 	%f108, %f105, 0f00000000, %p2;
+	mul.f32 	%f109, %f107, %f107;
+	fma.rn.f32 	%f110, %f106, %f106, %f109;
+	fma.rn.f32 	%f111, %f108, %f108, %f110;
+	fma.rn.f32 	%f112, %f111, 0f3FC00000, 0fBF800000;
+	mul.f32 	%f113, %f75, 0f3FF9999A;
+	mul.f32 	%f114, %f113, 0fBEAAAAAB;
+	mul.f32 	%f115, %f114, %f112;
+	fma.rn.f32 	%f20, %f1, 0fBF733334, %f115;
+	mul.f32 	%f116, %f113, 0f3D638E39;
+	fma.rn.f32 	%f117, %f107, 0f40900000, 0f40400000;
+	neg.f32 	%f118, %f112;
+	fma.rn.f32 	%f119, %f107, %f117, %f118;
+	mul.f32 	%f120, %f116, %f119;
+	fma.rn.f32 	%f21, %f2, 0fBF733334, %f120;
+	fma.rn.f32 	%f121, %f107, 0f40900000, 0fC0400000;
+	fma.rn.f32 	%f122, %f107, %f121, %f118;
+	mul.f32 	%f123, %f116, %f122;
+	fma.rn.f32 	%f2, %f3, 0fBF733334, %f123;
+	fma.rn.f32 	%f124, %f108, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f125, %f108, %f124, %f118;
+	mul.f32 	%f126, %f116, %f125;
+	fma.rn.f32 	%f23, %f6, 0fBF733334, %f126;
+	fma.rn.f32 	%f127, %f108, 0f40900000, 0fC0400000;
+	fma.rn.f32 	%f128, %f108, %f127, %f118;
+	mul.f32 	%f129, %f116, %f128;
+	fma.rn.f32 	%f6, %f7, 0fBF733334, %f129;
+	fma.rn.f32 	%f130, %f106, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f131, %f106, %f130, %f118;
+	mul.f32 	%f132, %f116, %f131;
+	fma.rn.f32 	%f25, %f4, 0fBF733334, %f132;
+	fma.rn.f32 	%f133, %f106, 0f40900000, 0fC0400000;
+	fma.rn.f32 	%f134, %f106, %f133, %f118;
+	mul.f32 	%f135, %f116, %f134;
+	fma.rn.f32 	%f4, %f5, 0fBF733334, %f135;
+	mul.f32 	%f136, %f113, 0f3CE38E39;
+	add.f32 	%f137, %f107, %f108;
+	fma.rn.f32 	%f138, %f137, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f139, %f137, %f138, %f118;
+	mul.f32 	%f140, %f136, %f139;
+	fma.rn.f32 	%f27, %f12, 0fBF733334, %f140;
+	sub.f32 	%f141, %f107, %f108;
+	fma.rn.f32 	%f142, %f141, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f143, %f141, %f142, %f118;
+	mul.f32 	%f144, %f136, %f143;
+	fma.rn.f32 	%f28, %f13, 0fBF733334, %f144;
+	neg.f32 	%f145, %f107;
+	sub.f32 	%f146, %f108, %f107;
+	fma.rn.f32 	%f147, %f146, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f148, %f146, %f147, %f118;
+	mul.f32 	%f149, %f136, %f148;
+	fma.rn.f32 	%f13, %f14, 0fBF733334, %f149;
+	sub.f32 	%f150, %f145, %f108;
+	fma.rn.f32 	%f151, %f150, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f152, %f150, %f151, %f118;
+	mul.f32 	%f153, %f136, %f152;
+	fma.rn.f32 	%f12, %f15, 0fBF733334, %f153;
+	add.f32 	%f154, %f106, %f107;
+	fma.rn.f32 	%f155, %f154, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f156, %f154, %f155, %f118;
+	mul.f32 	%f157, %f136, %f156;
+	fma.rn.f32 	%f31, %f8, 0fBF733334, %f157;
+	sub.f32 	%f158, %f106, %f107;
+	fma.rn.f32 	%f159, %f158, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f160, %f158, %f159, %f118;
+	mul.f32 	%f161, %f136, %f160;
+	fma.rn.f32 	%f32, %f10, 0fBF733334, %f161;
+	add.f32 	%f162, %f106, %f108;
+	fma.rn.f32 	%f163, %f162, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f164, %f162, %f163, %f118;
+	mul.f32 	%f165, %f136, %f164;
+	fma.rn.f32 	%f33, %f16, 0fBF733334, %f165;
+	sub.f32 	%f166, %f106, %f108;
+	fma.rn.f32 	%f167, %f166, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f168, %f166, %f167, %f118;
+	mul.f32 	%f169, %f136, %f168;
+	fma.rn.f32 	%f34, %f17, 0fBF733334, %f169;
+	neg.f32 	%f170, %f106;
+	sub.f32 	%f171, %f107, %f106;
+	fma.rn.f32 	%f172, %f171, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f173, %f171, %f172, %f118;
+	mul.f32 	%f174, %f136, %f173;
+	fma.rn.f32 	%f10, %f9, 0fBF733334, %f174;
+	sub.f32 	%f175, %f170, %f107;
+	fma.rn.f32 	%f176, %f175, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f177, %f175, %f176, %f118;
+	mul.f32 	%f178, %f136, %f177;
+	fma.rn.f32 	%f8, %f11, 0fBF733334, %f178;
+	sub.f32 	%f179, %f108, %f106;
+	fma.rn.f32 	%f180, %f179, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f181, %f179, %f180, %f118;
+	mul.f32 	%f182, %f136, %f181;
+	fma.rn.f32 	%f17, %f18, 0fBF733334, %f182;
+	sub.f32 	%f183, %f170, %f108;
+	fma.rn.f32 	%f184, %f183, 0f40900000, 0f40400000;
+	fma.rn.f32 	%f185, %f183, %f184, %f118;
+	mul.f32 	%f186, %f136, %f185;
+	fma.rn.f32 	%f16, %f19, 0fBF733334, %f186;
+	mov.f32 	%f5, %f25;
+	mov.f32 	%f7, %f23;
+	mov.f32 	%f11, %f31;
+	mov.f32 	%f9, %f32;
+	mov.f32 	%f15, %f27;
+	mov.f32 	%f14, %f28;
+	mov.f32 	%f19, %f33;
+	mov.f32 	%f18, %f34;
+	mov.f32 	%f3, %f21;
+	mov.f32 	%f1, %f20;
+BB0_2:
+	add.s32 	%r35, %r4, %r11;
+	st.global.f32 	[%r35], %f1;
+	add.s32 	%r37, %r13, %r4;
+	st.global.f32 	[%r37+9584640], %f3;
+	st.global.f32 	[%r37+19046400], %f2;
+	st.global.f32 	[%r37+28508160], %f5;
+	st.global.f32 	[%r37+37969920], %f4;
+	st.global.f32 	[%r37+47431680], %f7;
+	st.global.f32 	[%r37+56893440], %f6;
+	st.global.f32 	[%r37+66355200], %f11;
+	st.global.f32 	[%r37+75816960], %f10;
+	st.global.f32 	[%r37+85278720], %f9;
+	st.global.f32 	[%r37+94740480], %f8;
+	st.global.f32 	[%r37+104202240], %f15;
+	st.global.f32 	[%r37+113664000], %f14;
+	st.global.f32 	[%r37+123125760], %f13;
+	st.global.f32 	[%r37+132587520], %f12;
+	st.global.f32 	[%r37+142049280], %f19;
+	st.global.f32 	[%r37+151511040], %f18;
+	st.global.f32 	[%r37+160972800], %f17;
+	st.global.f32 	[%r37+170434560], %f16;
+	ret;
+}
+
diff --git a/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
index 433a17c299..128acb0298 100644
--- a/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
+++ b/llvm/test/VISC/parboil/benchmarks/lbm/src/opencl_nvidia/main.c
@@ -28,205 +28,207 @@ static cl_mem OpenCL_srcGrid, OpenCL_dstGrid;
 
 struct pb_TimerSet timers;
 int main( int nArgs, char* arg[] ) {
-	MAIN_Param param;
-	int t;
-
-	OpenCL_Param prm;
-
-	pb_InitializeTimerSet(&timers);
-        struct pb_Parameters* params;
-        params = pb_ReadParameters(&nArgs, arg);
-        
-
-	static LBM_GridPtr TEMP_srcGrid;
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
-	MAIN_parseCommandLine( nArgs, arg, &param, params );
-	MAIN_printInfo( &param );
-
-        pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-	OpenCL_initialize(&prm);
-	MAIN_initialize( &param, &prm );
-	
-	for( t = 1; t <= param.nTimeSteps; t++ ) {
-                pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-		OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid );
-                pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-		LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid );
-
-		if( (t & 63) == 0 ) {
-			printf( "timestep: %i\n", t );
+    MAIN_Param param;
+    int t;
+
+    OpenCL_Param prm;
+
+    pb_InitializeTimerSet(&timers);
+    struct pb_Parameters* params;
+    params = pb_ReadParameters(&nArgs, arg);
+
+
+    static LBM_GridPtr TEMP_srcGrid;
+    //Setup TEMP datastructures
+    LBM_allocateGrid( (float**) &TEMP_srcGrid );
+    MAIN_parseCommandLine( nArgs, arg, &param, params );
+    MAIN_printInfo( &param );
+
+    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    OpenCL_initialize(&prm);
+    MAIN_initialize( &param, &prm );
+
+    for( t = 1; t <= param.nTimeSteps; t++ ) {
+        pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
+        OpenCL_LBM_performStreamCollide( &prm, OpenCL_srcGrid, OpenCL_dstGrid );
+        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+        LBM_swapGrids( &OpenCL_srcGrid, &OpenCL_dstGrid );
+
+        if( (t & 63) == 0 ) {
+            printf( "timestep: %i\n", t );
 #if 0
-			CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
-			LBM_showGridStatistics( *TEMP_srcGrid );
+            CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
+            LBM_showGridStatistics( *TEMP_srcGrid );
 #endif
-		}
-	}
-	
-	MAIN_finalize( &param, &prm );
+        }
+    }
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
+    MAIN_finalize( &param, &prm );
+
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    LBM_freeGrid( (float**) &TEMP_srcGrid );
 
-        pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-        pb_PrintTimerSet(&timers);
-        pb_FreeParameters(params);
-	return 0;
+    pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+    pb_PrintTimerSet(&timers);
+    pb_FreeParameters(params);
+    return 0;
 }
 
 /*############################################################################*/
 
 void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters * params ) {
-	struct stat fileStat;
-
-	if( nArgs < 2 ) {
-		printf( "syntax: lbm <time steps>\n" );
-		exit( 1 );
-	}
-
-	param->nTimeSteps     = atoi( arg[1] );
-
-	if( params->inpFiles[0] != NULL ) {
-		param->obstacleFilename = params->inpFiles[0];
-
-		if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
-			printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
-					param->obstacleFilename );
-			exit( 1 );
-		}
-		if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
-			printf( "MAIN_parseCommandLine:\n"
-					"\tsize of file '%s' is %i bytes\n"
-					"\texpected size is %i bytes\n",
-					param->obstacleFilename, (int) fileStat.st_size,
-					SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
-			exit( 1 );
-		}
-	}
-	else param->obstacleFilename = NULL;
-
-        param->resultFilename = params->outFile;
+    struct stat fileStat;
+
+    if( nArgs < 2 ) {
+        printf( "syntax: lbm <time steps>\n" );
+        exit( 1 );
+    }
+
+    param->nTimeSteps     = atoi( arg[1] );
+
+    if( params->inpFiles[0] != NULL ) {
+        param->obstacleFilename = params->inpFiles[0];
+
+        if( stat( param->obstacleFilename, &fileStat ) != 0 ) {
+            printf( "MAIN_parseCommandLine: cannot stat obstacle file '%s'\n",
+                    param->obstacleFilename );
+            exit( 1 );
+        }
+        if( fileStat.st_size != SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z ) {
+            printf( "MAIN_parseCommandLine:\n"
+                    "\tsize of file '%s' is %i bytes\n"
+                    "\texpected size is %i bytes\n",
+                    param->obstacleFilename, (int) fileStat.st_size,
+                    SIZE_X*SIZE_Y*SIZE_Z+(SIZE_Y+1)*SIZE_Z );
+            exit( 1 );
+        }
+    }
+    else param->obstacleFilename = NULL;
+
+    param->resultFilename = params->outFile;
 }
 
 /*############################################################################*/
 
 void MAIN_printInfo( const MAIN_Param* param ) {
-	printf( "MAIN_printInfo:\n"
-			"\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
-			"\tnTimeSteps     : %i\n"
-			"\tresult file    : %s\n"
-			"\taction         : %s\n"
-			"\tsimulation type: %s\n"
-			"\tobstacle file  : %s\n\n",
-			SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
-			param->nTimeSteps, param->resultFilename, 
-			"store", "lid-driven cavity",
-			(param->obstacleFilename == NULL) ? "<none>" :
-			param->obstacleFilename );
+    printf( "MAIN_printInfo:\n"
+            "\tgrid size      : %i x %i x %i = %.2f * 10^6 Cells\n"
+            "\tnTimeSteps     : %i\n"
+            "\tresult file    : %s\n"
+            "\taction         : %s\n"
+            "\tsimulation type: %s\n"
+            "\tobstacle file  : %s\n\n",
+            SIZE_X, SIZE_Y, SIZE_Z, 1e-6*SIZE_X*SIZE_Y*SIZE_Z,
+            param->nTimeSteps, param->resultFilename,
+            "store", "lid-driven cavity",
+            (param->obstacleFilename == NULL) ? "<none>" :
+            param->obstacleFilename );
 }
 
 /*############################################################################*/
 
 void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-	static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
-
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
-	LBM_allocateGrid( (float**) &TEMP_dstGrid );
-	LBM_initializeGrid( TEMP_srcGrid );
-	LBM_initializeGrid( TEMP_dstGrid );
-
-        pb_SwitchToTimer(&timers, pb_TimerID_IO);
-	if( param->obstacleFilename != NULL ) {
-		LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename );
-		LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename );
-	}
-        
-	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
-	LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
-	
-        pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-	//Setup DEVICE datastructures
-	OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid );
-	OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid );
-	
-	//Initialize DEVICE datastructures
-        pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid );
-	OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid );
-	
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_showGridStatistics( TEMP_srcGrid );
-
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
-	LBM_freeGrid( (float**) &TEMP_dstGrid );
+    static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
+
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    //Setup TEMP datastructures
+    LBM_allocateGrid( (float**) &TEMP_srcGrid );
+    LBM_allocateGrid( (float**) &TEMP_dstGrid );
+    LBM_initializeGrid( TEMP_srcGrid );
+    LBM_initializeGrid( TEMP_dstGrid );
+
+    pb_SwitchToTimer(&timers, pb_TimerID_IO);
+    if( param->obstacleFilename != NULL ) {
+        LBM_loadObstacleFile( TEMP_srcGrid, param->obstacleFilename );
+        LBM_loadObstacleFile( TEMP_dstGrid, param->obstacleFilename );
+    }
+
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    LBM_initializeSpecialCellsForLDC( TEMP_srcGrid );
+    LBM_initializeSpecialCellsForLDC( TEMP_dstGrid );
+
+    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    //Setup DEVICE datastructures
+    OpenCL_LBM_allocateGrid( prm, &OpenCL_srcGrid );
+    OpenCL_LBM_allocateGrid( prm, &OpenCL_dstGrid );
+
+    //Initialize DEVICE datastructures
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+    OpenCL_LBM_initializeGrid( prm, OpenCL_srcGrid, TEMP_srcGrid );
+    OpenCL_LBM_initializeGrid( prm, OpenCL_dstGrid, TEMP_dstGrid );
+
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    LBM_showGridStatistics( TEMP_srcGrid );
+
+    LBM_freeGrid( (float**) &TEMP_srcGrid );
+    LBM_freeGrid( (float**) &TEMP_dstGrid );
 }
 
 /*############################################################################*/
 
 void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm ) {
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_Grid TEMP_srcGrid;
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    LBM_Grid TEMP_srcGrid;
 
-	//Setup TEMP datastructures
-	LBM_allocateGrid( (float**) &TEMP_srcGrid );
+    //Setup TEMP datastructures
+    LBM_allocateGrid( (float**) &TEMP_srcGrid );
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-	OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
+    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
+    OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
 
-        pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-	LBM_showGridStatistics( TEMP_srcGrid );
+    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+    LBM_showGridStatistics( TEMP_srcGrid );
 
-	LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE );
+    LBM_storeVelocityField( TEMP_srcGrid, param->resultFilename, TRUE );
 
-	LBM_freeGrid( (float**) &TEMP_srcGrid );
+    LBM_freeGrid( (float**) &TEMP_srcGrid );
 
-        pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
-	OpenCL_LBM_freeGrid( OpenCL_srcGrid );
-	OpenCL_LBM_freeGrid( OpenCL_dstGrid );
+    pb_SwitchToTimer(&timers, visc_TimerID_SETUP);
+    OpenCL_LBM_freeGrid( OpenCL_srcGrid );
+    OpenCL_LBM_freeGrid( OpenCL_dstGrid );
 
-	clReleaseProgram(prm->clProgram);
-	clReleaseKernel(prm->clKernel);
-	clReleaseCommandQueue(prm->clCommandQueue);
-	clReleaseContext(prm->clContext);
+    clReleaseProgram(prm->clProgram);
+    clReleaseKernel(prm->clKernel);
+    clReleaseCommandQueue(prm->clCommandQueue);
+    clReleaseContext(prm->clContext);
 }
 
 void OpenCL_initialize(OpenCL_Param* prm)
 {
-	cl_int clStatus;
-	
-	clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL);
-	CHECK_ERROR("clGetPlatformIDs")
+    cl_int clStatus;
+
+    clStatus = clGetPlatformIDs(1,&(prm->clPlatform),NULL);
+    CHECK_ERROR("clGetPlatformIDs")
+
+    prm->clCps[0] = CL_CONTEXT_PLATFORM;
+    prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
+    prm->clCps[2] = 0;
+
+    clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL);
+    CHECK_ERROR("clGetDeviceIDs")
+
+    prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
+    CHECK_ERROR("clCreateContextFromType")
 
-	prm->clCps[0] = CL_CONTEXT_PLATFORM;
-	prm->clCps[1] = (cl_context_properties)(prm->clPlatform);
-	prm->clCps[2] = 0;
+    prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+    CHECK_ERROR("clCreateCommandQueue")
 
-	clStatus = clGetDeviceIDs(prm->clPlatform,CL_DEVICE_TYPE_GPU,1,&(prm->clDevice),NULL);
-	CHECK_ERROR("clGetDeviceIDs")
+    pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
 
-	prm->clContext = clCreateContextFromType(prm->clCps,CL_DEVICE_TYPE_GPU,NULL,NULL,&clStatus);
-	CHECK_ERROR("clCreateContextFromType")
+    //const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
+    //prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);
+    //CHECK_ERROR("clCreateProgramWithSource")
 
-	prm->clCommandQueue = clCreateCommandQueue(prm->clContext,prm->clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-	CHECK_ERROR("clCreateCommandQueue")
+    //char clOptions[100];
+    //sprintf(clOptions,"-I src/opencl_nvidia");
 
-  	pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));
+    //clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);
+    //CHECK_ERROR("clBuildProgram")
 
-	const char* clSource[] = {readFile("src/opencl_nvidia/kernel.cl")};
-	prm->clProgram = clCreateProgramWithSource(prm->clContext,1,clSource,NULL,&clStatus);
-	CHECK_ERROR("clCreateProgramWithSource")
+    //prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);
+    //CHECK_ERROR("clCreateKernel")
 
-	char clOptions[100];
-	sprintf(clOptions,"-I src/opencl_nvidia");
-		
-	clStatus = clBuildProgram(prm->clProgram,1,&(prm->clDevice),clOptions,NULL,NULL);
-	CHECK_ERROR("clBuildProgram")
-	
-	prm->clKernel = clCreateKernel(prm->clProgram,"performStreamCollide_kernel",&clStatus);
-	CHECK_ERROR("clCreateKernel")
+    //free((void*)clSource[0]);
 
-	free((void*)clSource[0]);
+    pb_CreateAndBuildKernelFromBinary("src/opencl_nvidia/kernel_offline.nvptx.s", "performStreamCollide_kernel", &prm->clContext, &prm->clDevice, &prm->clProgram, &prm->clKernel);
 }
-- 
GitLab