diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/Makefile b/llvm/test/VISC/gemm_opencl/matrixMul/Makefile index 816e85da5fed3c3df732151a7a21aa0a84fc4563..8426617b1812c901bf315424b5f2d7b5d9f164d5 100644 --- a/llvm/test/VISC/gemm_opencl/matrixMul/Makefile +++ b/llvm/test/VISC/gemm_opencl/matrixMul/Makefile @@ -1,24 +1,30 @@ -PASSES := +PASSES := .PHONY: clean -LIBCLC:=/home/kotsifa2/llvm/libclc -HOST:=sgemm +LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install +LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc +HOST:=gemm_opencl KERNELS:=matrixMul +LLVM_CC:=$(LLVM_INSTALL)/bin/clang +LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link -all: $(KERNELS:%=%.ll) $(HOST:%=%.ll) +all: $(KERNELS:%=%.nvptx.s) $(HOST:%=%.ll) $(HOST:%=%.bin) $(KERNELS:%=%.ll):%.ll:%.cl - clang -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ + $(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ $(KERNELS:%=%.linked.bc):%.linked.bc:%.ll - llvm-link $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ + $(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ $(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc - clang -target nvptx $< -S -o $@ + $(LLVM_CC) -O3 -target nvptx $< -S -o $@ $(HOST:%=%.ll):%.ll:%.c - clang -O3 -S -emit-llvm $< -o $@ + $(LLVM_CC) -O3 -S -emit-llvm -I /usr/local/cuda/include $< -o $@ + +$(HOST:%=%.bin):%.bin:%.c + $(LLVM_CC) -O3 -lOpenCL -I /usr/local/cuda/include $< -o $@ clean : - rm -f *.ll *.bc *.s + rm -f *.ll *.bc *.s *.bin diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c b/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c index 9b83609cc0d4117f9b333e4ada759c4854cd17bb..89c535add07650ba36e26c45244a420f2ef2801f 100644 --- a/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c +++ b/llvm/test/VISC/gemm_opencl/matrixMul/gemm_opencl.c @@ -1,7 +1,8 @@ #include <stdlib.h> #include <stdio.h> #include <math.h> -#include <oclUtils.h> +#include <string.h> +#include <CL/cl.h> #define WA 1024 #define HA 1024 @@ -10,10 +11,12 @@ #define WC WB #define HC HA + + // Thread block size #define BLOCK_SIZE 16 -inline void checkErr(cl_int err, cl_int success, const char * name) { +static inline void checkErr(cl_int err, cl_int success, const char * name) { if (err != success) { fprintf(stderr, "ERROR: %s\n", name); exit(EXIT_FAILURE); @@ -25,13 +28,93 @@ void randomInit(float* data, int size) { for (int i = 0; i < size; ++i) data[i] = rand() / (float)RAND_MAX; } - + +////////////////////////////////////////////////////////////////////////////// +//! Loads a Program file. +//! +//! @return the source string if succeeded, 0 otherwise +//! @param cFilename program filename +//! @param szFinalLength returned length of the code string +////////////////////////////////////////////////////////////////////////////// +char* LoadProgSource(const char* cFilename, size_t* szFinalLength) +{ + // locals + FILE* pFileStream = NULL; + size_t szSourceLength; + + // open the OpenCL source code file + #ifdef _WIN32 // Windows version + if(fopen_s(&pFileStream, cFilename, "rb") != 0) + { + return NULL; + } + #else // Linux version + pFileStream = fopen(cFilename, "rb"); + if(pFileStream == 0) + { + return NULL; + } + #endif + + // get the length of the source code + fseek(pFileStream, 0, SEEK_END); + szSourceLength = ftell(pFileStream); + fseek(pFileStream, 0, SEEK_SET); + + // allocate a buffer for the source code string and read it in + char* cSourceString = (char *)malloc(szSourceLength + 1); + if (fread((cSourceString), szSourceLength, 1, pFileStream) != 1) + { + fclose(pFileStream); + free(cSourceString); + return 0; + } + + // close the file and return the total length of the combined (preamble + source) string + fclose(pFileStream); + if(szFinalLength != 0) + { + *szFinalLength = szSourceLength; + } + cSourceString[szSourceLength] = '\0'; + + return cSourceString; +} + +// Check bool +int isEqual(float a, float b) { + return (fabs(a-b) < 0.001); +} + +// Check Results + +int checkResults(float* A, float* B, float* C) { + unsigned int size_A = WA * HA; + unsigned int size_B = WB * HB; + unsigned int size_C = WC * HC; + unsigned int bytesC = sizeof(float) * size_C; + float* goldC = (float*) malloc(bytesC); + for (int i=0; i < HC; i++) { + for (int j=0; j < WC; j++) { + goldC[i*WC + j] = 0; + for (int k=0; k < HB; k++) { + goldC[i*WC + j] += A[i*WA + k] * B[k*WB + j]; + } + if(!isEqual(goldC[i*WC + j], C[i*WC + j])) { + printf("Mismatch at %d,%d --- C = %f and goldC = %f\n", i, j, C[i*WC+j], goldC[i*WC+j]); + return 0; + } + } + } + return 1; // Success +} + // Main int main(int argc, char** argv) { // seed for rand() srand(2006); - + // Allocate host memory for matrices A and B unsigned int size_A = WA * HA; unsigned int bytes_A = sizeof(float) * size_A; @@ -40,12 +123,12 @@ int main(int argc, char** argv) { unsigned int size_B = WB * HB; unsigned int bytes_B = sizeof(float) * size_B; float* h_B = (float*) malloc(bytes_B); - + // Initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); -/* +/* // Print A and B printf("\n\nMatrix A\n"); for(int i = 0; i < size_A; i++) @@ -54,7 +137,7 @@ int main(int argc, char** argv) { if(((i + 1) % WA) == 0) printf("\n"); } - + printf("\n\nMatrix B\n"); for(int i = 0; i < size_B; i++) { @@ -68,7 +151,7 @@ int main(int argc, char** argv) { unsigned int size_C = WC * HC; unsigned int bytes_C = sizeof(float) * size_C; float* h_C = (float*) malloc(bytes_C); - + // Initialize OpenCL // OpenCL specific variables @@ -76,7 +159,7 @@ int main(int argc, char** argv) { cl_command_queue clCommandQue; cl_program clProgram; cl_kernel clKernel; - + size_t dataBytes; size_t kernelLength; cl_int errcode; @@ -89,47 +172,86 @@ int main(int argc, char** argv) { /*****************************************/ /* Initialize OpenCL */ /*****************************************/ - clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, + // query the number of platforms + cl_uint numPlatforms; + errcode = clGetPlatformIDs(0, NULL, &numPlatforms); + checkErr(errcode, CL_SUCCESS, "Failure to get number of platforms"); + + // now get all the platform IDs + cl_platform_id platforms[numPlatforms]; + errcode = clGetPlatformIDs(numPlatforms, platforms, NULL); + checkErr(errcode, CL_SUCCESS, "Failure to get platform IDs"); + + for(unsigned i=0; i < numPlatforms; i++) { + char buffer[10240]; + printf(" -- %d --\n", i); + clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL); + printf(" PROFILE = %s\n", buffer); + clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL); + printf(" VERSION = %s\n", buffer); + clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL); + printf(" NAME = %s\n", buffer); + clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL); + printf(" VENDOR = %s\n", buffer); + clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL); + printf(" EXTENSIONS = %s\n", buffer); + } + // set platform property - just pick the first one + cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, + (int) platforms[0], + 0}; + clGPUContext = clCreateContextFromType(properties, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); - shrCheckError(errcode, CL_SUCCESS); + checkErr(errcode, CL_SUCCESS, "Failure to create GPU context"); // get the list of GPU devices associated with context errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); - errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, + errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); - shrCheckError(errcode, CL_SUCCESS); + checkErr(errcode, CL_SUCCESS, "Failure to get context info"); //Create a command-queue clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], 0, &errcode); - shrCheckError(errcode, CL_SUCCESS); - + checkErr(errcode, CL_SUCCESS, "Failure to create command queue"); + // Setup device memory d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, bytes_C, NULL, &errcode); - d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, bytes_A, h_A, &errcode); - d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, bytes_B, h_B, &errcode); - - + + // Load and build OpenCL kernel - char *clMatrixMul = oclLoadProgSource("kernel.cl", - "// My comment\n", - &kernelLength); - shrCheckError(clMatrixMul != NULL, shrTRUE); - - clProgram = clCreateProgramWithSource(clGPUContext, 1, - (const char **)&clMatrixMul, + /*char *clMatrixMul = LoadProgSource("matrixMul.cl", + "// My comment\n", + &kernelLength);*/ + //checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program"); + + /*clProgram = clCreateProgramWithSource(clGPUContext, 1, + (const char **)&clMatrixMul, &kernelLength, &errcode); - shrCheckError(errcode, CL_SUCCESS); + checkErr(errcode, CL_SUCCESS, "Failure to create program from source"); +*/ + size_t binaryLength; + char *clMatrixMul = LoadProgSource("matrixMul.nvptx.s", &binaryLength); + checkErr(clMatrixMul != NULL, 1 /*bool true*/, "Failure to load Program Binary"); + + cl_int binaryStatus; + clProgram = clCreateProgramWithBinary(clGPUContext, 1, &clDevices[0], + &binaryLength, + (const unsigned char **)&clMatrixMul, + &binaryStatus, &errcode); + checkErr(errcode, CL_SUCCESS, "Failure to create program from binary"); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); - + checkErr(errcode, CL_SUCCESS, "Failure to build program"); + clKernel = clCreateKernel(clProgram, "matrixMul", &errcode); - shrCheckError(errcode, CL_SUCCESS); + checkErr(errcode, CL_SUCCESS, "Failure to create kernel"); // Launch OpenCL kernel @@ -142,7 +264,7 @@ int main(int argc, char** argv) { errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); - shrCheckError(errcode, CL_SUCCESS); + checkErr(errcode, CL_SUCCESS, "Failure to set kernel arguments"); localWorkSize[0] = BLOCK_SIZE; localWorkSize[1] = BLOCK_SIZE; @@ -152,12 +274,12 @@ int main(int argc, char** argv) { errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); + checkErr(errcode, CL_SUCCESS, "Failure to enqueue kernel"); // Retrieve result from device errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, bytes_C, h_C, 0, NULL, NULL); - shrCheckError(errcode, CL_SUCCESS); + checkErr(errcode, CL_SUCCESS, "Failure to read buffer"); // Print out the result /* @@ -169,6 +291,10 @@ int main(int argc, char** argv) { } printf("\n"); */ + if(checkResults(h_A, h_B, h_C)) + printf("\nPass!\n"); + else + printf("\nFailed!\n"); printf("\nDone!\n"); // Deallocate memory diff --git a/llvm/test/VISC/gemm_opencl/matrixMul/matrixMul.cl b/llvm/test/VISC/gemm_opencl/matrixMul/matrixMul.cl index d3926a9f9f1bbc4d41e473ea8851ea3bc520e878..936d354542da6c36400903238600f698f0087138 100644 --- a/llvm/test/VISC/gemm_opencl/matrixMul/matrixMul.cl +++ b/llvm/test/VISC/gemm_opencl/matrixMul/matrixMul.cl @@ -18,7 +18,6 @@ __kernel void matrixMul(__global float* C, for (int i = 0; i < k; i++) { res += A[ty*k+i] * B[i*n+tx]; } - // Write in device memory C[ty*n+tx] = res; diff --git a/llvm/test/VISC/gemm_opencl/matrixMul_bc/Makefile b/llvm/test/VISC/gemm_opencl/matrixMul_bc/Makefile index f4c3d9b296b18baa1e60b58c705878e1a256a3cb..1984e14c78f3fabd3f2b92a98b81919dbaf2b979 100644 --- a/llvm/test/VISC/gemm_opencl/matrixMul_bc/Makefile +++ b/llvm/test/VISC/gemm_opencl/matrixMul_bc/Makefile @@ -1,24 +1,27 @@ -PASSES := +PASSES := .PHONY: clean -LIBCLC:=/home/kotsifa2/llvm/libclc +LLVM_INSTALL:=/home/psrivas2/Hetero/VISC/Code/trunk/llvm-install +LIBCLC:=/home/psrivas2/Hetero/VISC/Code/trunk/libclc HOST:=sgemm KERNELS:=matrixMul_bc +LLVM_CC:=$(LLVM_INSTALL)/bin/clang +LLVM_LINK:=$(LLVM_INSTALL)/bin/llvm-link all: $(KERNELS:%=%.ll) $(HOST:%=%.ll) $(KERNELS:%=%.ll):%.ll:%.cl - clang -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ + $(LLVM_CC) -Dcl_clang_storage_class_specifiers -isystem $(LIBCLC)/generic/include -include clc/clc.h -target nvptx--nvidiacl -xcl $< -O3 -emit-llvm -S -o $@ $(KERNELS:%=%.linked.bc):%.linked.bc:%.ll - llvm-link $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ + $(LLVM_LINK) $(LIBCLC)/built_libs/nvptx--nvidiacl.bc $< -o $@ $(KERNELS:%=%.nvptx.s):%.nvptx.s:%.linked.bc - clang -target nvptx $< -S -o $@ + $(LLVM_CC) -target nvptx $< -S -o $@ $(HOST:%=%.ll):%.ll:%.c - clang -O3 -S -emit-llvm $< -o $@ + $(LLVM_CC) -O3 -S -emit-llvm $< -o $@ clean : rm -f *.ll *.bc *.s diff --git a/llvm/test/VISC/gemm_opencl/matrixMul_bc/gemm_opencl.c b/llvm/test/VISC/gemm_opencl/matrixMul_bc/gemm_opencl.c index 9b83609cc0d4117f9b333e4ada759c4854cd17bb..31cd7502ea360592ea845c9705de2568f35d6de9 100644 --- a/llvm/test/VISC/gemm_opencl/matrixMul_bc/gemm_opencl.c +++ b/llvm/test/VISC/gemm_opencl/matrixMul_bc/gemm_opencl.c @@ -1,7 +1,8 @@ #include <stdlib.h> #include <stdio.h> #include <math.h> -#include <oclUtils.h> +//#include <oclUtils.h> +#include <CL/cl.h> #define WA 1024 #define HA 1024 @@ -118,7 +119,7 @@ int main(int argc, char** argv) { char *clMatrixMul = oclLoadProgSource("kernel.cl", "// My comment\n", &kernelLength); - shrCheckError(clMatrixMul != NULL, shrTRUE); + shrCheckError(clMatrixMul != NULL, CL_SUCCESS); clProgram = clCreateProgramWithSource(clGPUContext, 1, (const char **)&clMatrixMul,