Skip to content
Snippets Groups Projects
Commit 4f1965c1 authored by Prakalp Srivastava's avatar Prakalp Srivastava
Browse files

Changes to Makefile and test case. matrixMul compiles correctly. It has been

changed to load a kernel binary (ptx) instead of kernel source
parent 0ff16a7f
No related branches found
No related tags found
No related merge requests found
PASSES := PASSES :=
.PHONY: clean .PHONY: clean
LIBCLC:=/home/kotsifa2/llvm/libclc LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
HOST:=sgemm LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc
HOST:=gemm_opencl
KERNELS:=matrixMul KERNELS:=matrixMul
LLVM_CC:=$(LLVM_INSTALL)/bin/clang
LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link
all: $(KERNELS:%=%.ll) $(HOST:%=%.ll) all: $(KERNELS:%=%.nvptx.s) $(HOST:%=%.ll) $(HOST:%=%.bin)
$(KERNELS:%=%.ll):%.ll:%.cl $(KERNELS:%=%.ll):%.ll:%.cl
clang -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ $(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll $(KERNELS:%=%.linked.bc):%.linked.bc:%.ll
llvm-link $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ $(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc $(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc
clang -target nvptx $< -S -o $@ $(LLVM_CC) -O3 -target nvptx $< -S -o $@
$(HOST:%=%.ll):%.ll:%.c $(HOST:%=%.ll):%.ll:%.c
clang -O3 -S -emit-llvm $< -o $@ $(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@
$(HOST:%=%.bin):%.bin:%.c
$(LLVM_CC) -O3 -lOpenCL -I /usr/local/cuda/include $< -o $@
clean : clean :
rm -f *.ll *.bc *.s rm -f *.ll *.bc *.s *.bin
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <math.h> #include <math.h>
#include <oclUtils.h> #include <string.h>
#include <CL/cl.h>
#define WA 1024 #define WA 1024
#define HA 1024 #define HA 1024
...@@ -10,10 +11,12 @@ ...@@ -10,10 +11,12 @@
#define WC WB #define WC WB
#define HC HA #define HC HA
// Thread block size // Thread block size
#define BLOCK_SIZE 16 #define BLOCK_SIZE 16
inline void checkErr(cl_int err, cl_int success, const char * name) { static inline void checkErr(cl_int err, cl_int success, const char * name) {
if (err != success) { if (err != success) {
fprintf(stderr, "ERROR: %s\n", name); fprintf(stderr, "ERROR: %s\n", name);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
...@@ -25,13 +28,93 @@ void randomInit(float* data, int size) { ...@@ -25,13 +28,93 @@ void randomInit(float* data, int size) {
for (int i = 0; i < size; ++i) for (int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX; data[i] = rand() / (float)RAND_MAX;
} }
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
char* LoadProgSource(const char* cFilename, size_t* szFinalLength)
{
// locals
FILE* pFileStream = NULL;
size_t szSourceLength;
// open the OpenCL source code file
#ifdef _WIN32 // Windows version
if(fopen_s(&pFileStream, cFilename, "rb") != 0)
{
return NULL;
}
#else // Linux version
pFileStream = fopen(cFilename, "rb");
if(pFileStream == 0)
{
return NULL;
}
#endif
// get the length of the source code
fseek(pFileStream, 0, SEEK_END);
szSourceLength = ftell(pFileStream);
fseek(pFileStream, 0, SEEK_SET);
// allocate a buffer for the source code string and read it in
char* cSourceString = (char *)malloc(szSourceLength + 1);
if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1)
{
fclose(pFileStream);
free(cSourceString);
return 0;
}
// close the file and return the total length of the combined (preamble + source) string
fclose(pFileStream);
if(szFinalLength != 0)
{
*szFinalLength = szSourceLength;
}
cSourceString[szSourceLength] = '\0';
return cSourceString;
}
// Check bool
int isEqual(float a, float b) {
return (fabs(a-b) < 0.001);
}
// Check Results
int checkResults(float* A, float* B, float* C) {
unsigned int size_A = WA * HA;
unsigned int size_B = WB * HB;
unsigned int size_C = WC * HC;
unsigned int bytesC = sizeof(float) * size_C;
float* goldC = (float*) malloc(bytesC);
for (int i=0; i < HC; i++) {
for (int j=0; j < WC; j++) {
goldC[i*WC + j] = 0;
for (int k=0; k < HB; k++) {
goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j];
}
if(!isEqual(goldC[i*WC + j], C[i*WC + j])) {
printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]);
return 0;
}
}
}
return 1; // Success
}
// Main // Main
int main(int argc, char** argv) { int main(int argc, char** argv) {
// seed for rand() // seed for rand()
srand(2006); srand(2006);
// Allocate host memory for matrices A and B // Allocate host memory for matrices A and B
unsigned int size_A = WA * HA; unsigned int size_A = WA * HA;
unsigned int bytes_A = sizeof(float) * size_A; unsigned int bytes_A = sizeof(float) * size_A;
...@@ -40,12 +123,12 @@ int main(int argc, char** argv) { ...@@ -40,12 +123,12 @@ int main(int argc, char** argv) {
unsigned int size_B = WB * HB; unsigned int size_B = WB * HB;
unsigned int bytes_B = sizeof(float) * size_B; unsigned int bytes_B = sizeof(float) * size_B;
float* h_B = (float*) malloc(bytes_B); float* h_B = (float*) malloc(bytes_B);
// Initialize host memory // Initialize host memory
randomInit(h_A, size_A); randomInit(h_A, size_A);
randomInit(h_B, size_B); randomInit(h_B, size_B);
/* /*
// Print A and B // Print A and B
printf("\n\nMatrix A\n"); printf("\n\nMatrix A\n");
for(int i = 0; i < size_A; i++) for(int i = 0; i < size_A; i++)
...@@ -54,7 +137,7 @@ int main(int argc, char** argv) { ...@@ -54,7 +137,7 @@ int main(int argc, char** argv) {
if(((i + 1) % WA) == 0) if(((i + 1) % WA) == 0)
printf("\n"); printf("\n");
} }
printf("\n\nMatrix B\n"); printf("\n\nMatrix B\n");
for(int i = 0; i < size_B; i++) for(int i = 0; i < size_B; i++)
{ {
...@@ -68,7 +151,7 @@ int main(int argc, char** argv) { ...@@ -68,7 +151,7 @@ int main(int argc, char** argv) {
unsigned int size_C = WC * HC; unsigned int size_C = WC * HC;
unsigned int bytes_C = sizeof(float) * size_C; unsigned int bytes_C = sizeof(float) * size_C;
float* h_C = (float*) malloc(bytes_C); float* h_C = (float*) malloc(bytes_C);
// Initialize OpenCL // Initialize OpenCL
// OpenCL specific variables // OpenCL specific variables
...@@ -76,7 +159,7 @@ int main(int argc, char** argv) { ...@@ -76,7 +159,7 @@ int main(int argc, char** argv) {
cl_command_queue clCommandQue; cl_command_queue clCommandQue;
cl_program clProgram; cl_program clProgram;
cl_kernel clKernel; cl_kernel clKernel;
size_t dataBytes; size_t dataBytes;
size_t kernelLength; size_t kernelLength;
cl_int errcode; cl_int errcode;
...@@ -89,47 +172,86 @@ int main(int argc, char** argv) { ...@@ -89,47 +172,86 @@ int main(int argc, char** argv) {
/*****************************************/ /*****************************************/
/* Initialize OpenCL */ /* Initialize OpenCL */
/*****************************************/ /*****************************************/
clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, // query the number of platforms
cl_uint numPlatforms;
errcode = clGetPlatformIDs(0, NULL, &numPlatforms);
checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms");
// now get all the platform IDs
cl_platform_id platforms[numPlatforms];
errcode = clGetPlatformIDs(numPlatforms, platforms, NULL);
checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs");
for(unsigned i=0; i < numPlatforms; i++) {
char buffer[10240];
printf(" -- %d --\n", i);
clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL);
printf(" PROFILE = %s\n", buffer);
clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL);
printf(" VERSION = %s\n", buffer);
clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL);
printf(" NAME = %s\n", buffer);
clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL);
printf(" VENDOR = %s\n", buffer);
clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
printf(" EXTENSIONS = %s\n", buffer);
}
// set platform property - just pick the first one
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM,
(int) platforms[0],
0};
clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU,
NULL, NULL, &errcode); NULL, NULL, &errcode);
shrCheckError(errcode, CL_SUCCESS); checkErr(errcode, CL_SUCCESS, "Failure to create GPU context");
// get the list of GPU devices associated with context // get the list of GPU devices associated with context
errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0,
NULL, &dataBytes); NULL, &dataBytes);
cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes);
errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes,
clDevices, NULL); clDevices, NULL);
shrCheckError(errcode, CL_SUCCESS); checkErr(errcode, CL_SUCCESS, "Failure to get context info");
//Create a command-queue //Create a command-queue
clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode); clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode);
shrCheckError(errcode, CL_SUCCESS); checkErr(errcode, CL_SUCCESS, "Failure to create command queue");
// Setup device memory // Setup device memory
d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL, d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL,
&errcode); &errcode);
d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
bytes_A, h_A, &errcode); bytes_A, h_A, &errcode);
d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
bytes_B, h_B, &errcode); bytes_B, h_B, &errcode);
// Load and build OpenCL kernel // Load and build OpenCL kernel
char *clMatrixMul = oclLoadProgSource("kernel.cl", /*char *clMatrixMul = LoadProgSource("matrixMul.cl",
"// My comment\n", "// My comment\n",
&kernelLength); &kernelLength);*/
shrCheckError(clMatrixMul != NULL, shrTRUE); //checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program");
clProgram = clCreateProgramWithSource(clGPUContext, 1, /*clProgram = clCreateProgramWithSource(clGPUContext, 1,
(const char **)&clMatrixMul, (const char **)&clMatrixMul,
&kernelLength, &errcode); &kernelLength, &errcode);
shrCheckError(errcode, CL_SUCCESS); checkErr(errcode, CL_SUCCESS, "Failure to create program from source");
*/
size_t binaryLength;
char *clMatrixMul = LoadProgSource("matrixMul.nvptx.s", &binaryLength);
checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program Binary");
cl_int binaryStatus;
clProgram = clCreateProgramWithBinary(clGPUContext, 1, &clDevices[0],
&binaryLength,
(const unsigned char **)&clMatrixMul,
&binaryStatus, &errcode);
checkErr(errcode, CL_SUCCESS, "Failure to create program from binary");
errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS); checkErr(errcode, CL_SUCCESS, "Failure to build program");
clKernel = clCreateKernel(clProgram, "matrixMul", &errcode); clKernel = clCreateKernel(clProgram, "matrixMul", &errcode);
shrCheckError(errcode, CL_SUCCESS); checkErr(errcode, CL_SUCCESS, "Failure to create kernel");
// Launch OpenCL kernel // Launch OpenCL kernel
...@@ -142,7 +264,7 @@ int main(int argc, char** argv) { ...@@ -142,7 +264,7 @@ int main(int argc, char** argv) {
errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B);
errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA);
errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC);
shrCheckError(errcode, CL_SUCCESS); checkErr(errcode, CL_SUCCESS, "Failure to set kernel arguments");
localWorkSize[0] = BLOCK_SIZE; localWorkSize[0] = BLOCK_SIZE;
localWorkSize[1] = BLOCK_SIZE; localWorkSize[1] = BLOCK_SIZE;
...@@ -152,12 +274,12 @@ int main(int argc, char** argv) { ...@@ -152,12 +274,12 @@ int main(int argc, char** argv) {
errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL,
globalWorkSize, localWorkSize, globalWorkSize, localWorkSize,
0, NULL, NULL); 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS); checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel");
// Retrieve result from device // Retrieve result from device
errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C,
h_C, 0, NULL, NULL); h_C, 0, NULL, NULL);
shrCheckError(errcode, CL_SUCCESS); checkErr(errcode, CL_SUCCESS, "Failure to read buffer");
// Print out the result // Print out the result
/* /*
...@@ -169,6 +291,10 @@ int main(int argc, char** argv) { ...@@ -169,6 +291,10 @@ int main(int argc, char** argv) {
} }
printf("\n"); printf("\n");
*/ */
if(checkResults(h_A, h_B, h_C))
printf("\nPass!\n");
else
printf("\nFailed!\n");
printf("\nDone!\n"); printf("\nDone!\n");
// Deallocate memory // Deallocate memory
......
...@@ -18,7 +18,6 @@ __kernel void matrixMul(__global float* C, ...@@ -18,7 +18,6 @@ __kernel void matrixMul(__global float* C,
for (int i = 0; i < k; i++) { for (int i = 0; i < k; i++) {
res += A[ty*k+i] * B[i*n+tx]; res += A[ty*k+i] * B[i*n+tx];
} }
// Write in device memory // Write in device memory
C[ty*n+tx] = res; C[ty*n+tx] = res;
......
PASSES := PASSES :=
.PHONY: clean .PHONY: clean
LIBCLC:=/home/kotsifa2/llvm/libclc LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install
LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc
HOST:=sgemm HOST:=sgemm
KERNELS:=matrixMul_bc KERNELS:=matrixMul_bc
LLVM_CC:=$(LLVM_INSTALL)/bin/clang
LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link
all: $(KERNELS:%=%.ll) $(HOST:%=%.ll) all: $(KERNELS:%=%.ll) $(HOST:%=%.ll)
$(KERNELS:%=%.ll):%.ll:%.cl $(KERNELS:%=%.ll):%.ll:%.cl
clang -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ $(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@
$(KERNELS:%=%.linked.bc):%.linked.bc:%.ll $(KERNELS:%=%.linked.bc):%.linked.bc:%.ll
llvm-link $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ $(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@
$(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc $(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc
clang -target nvptx $< -S -o $@ $(LLVM_CC) -target nvptx $< -S -o $@
$(HOST:%=%.ll):%.ll:%.c $(HOST:%=%.ll):%.ll:%.c
clang -O3 -S -emit-llvm $< -o $@ $(LLVM_CC) -O3 -S -emit-llvm $< -o $@
clean : clean :
rm -f *.ll *.bc *.s rm -f *.ll *.bc *.s
#include <stdlib.h> #include <stdlib.h>
#include <stdio.h> #include <stdio.h>
#include <math.h> #include <math.h>
#include <oclUtils.h> //#include <oclUtils.h>
#include <CL/cl.h>
#define WA 1024 #define WA 1024
#define HA 1024 #define HA 1024
...@@ -118,7 +119,7 @@ int main(int argc, char** argv) { ...@@ -118,7 +119,7 @@ int main(int argc, char** argv) {
char *clMatrixMul = oclLoadProgSource("kernel.cl", char *clMatrixMul = oclLoadProgSource("kernel.cl",
"// My comment\n", "// My comment\n",
&kernelLength); &kernelLength);
shrCheckError(clMatrixMul != NULL, shrTRUE); shrCheckError(clMatrixMul != NULL, CL_SUCCESS);
clProgram = clCreateProgramWithSource(clGPUContext, 1, clProgram = clCreateProgramWithSource(clGPUContext, 1,
(const char **)&clMatrixMul, (const char **)&clMatrixMul,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment