Skip to content
Snippets Groups Projects
Commit 2251d376 authored by Prakalp Srivastava's avatar Prakalp Srivastava
Browse files

Creating a new test case visc_mri-q. Initial commit

parent cd8e1fc5
No related branches found
No related tags found
No related merge requests found
......@@ -8,23 +8,88 @@
#include <stdio.h>
#include <malloc.h>
#include <CL/cl.h>
#include "ocl.h"
#include <math.h>
#include "macros.h"
#include "computeQ.h"
#define NC 4
void __attribute__ ((noinline)) computePhiMag_GPU_kernel(float* phiR, size_t bytes_phiR, float* phiI, size_t bytes_phiI, float* phiMag, size_t bytes_phiMag, int numK) {
int indexK = get_global_id(0);
if (indexK < numK) {
float real = phiR[indexK];
float imag = phiI[indexK];
phiMag[indexK] = real*real + imag*imag;
}
}
void computePhiMag_GPU(int numK, float* phiR, float* phiI, float* phiMag)
{
int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK;
if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK)
phiMagBlocks++;
size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK;
size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK;
ComputePhiMag_GPU(phiR, phiI, phiMag, numK);
size_t bytes_phi = numK * sizeof(float);
computePhiMag_GPU_kernel(phiR, bytes_phi, phiI, bytes_phi, phiMag, bytes_phi, numK);
}
void __attribute__ ((noinline)) computeQ_GPU_kernel(int numK, int kGlobalIndex,
float* x, size_t bytes_x, float* y, size_t bytes_y, float* z, size_t bytes_z,
float* Qr, size_t bytes_Qr, float* Qi, size_t bytes_Qi, struct kValues* ck, size_t bytes_ck)
{
float sX[NC];
float sY[NC];
float sZ[NC];
float sQr[NC];
float sQi[NC];
#pragma unroll
for (int tx = 0; tx < NC; tx++) {
int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;
sX[tx] = x[xIndex];
sY[tx] = y[xIndex];
sZ[tx] = z[xIndex];
sQr[tx] = Qr[xIndex];
sQi[tx] = Qi[xIndex];
}
// Loop over all elements of K in constant mem to compute a partial value
// for X.
int kIndex = 0;
for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID) && (kGlobalIndex < numK);
kIndex ++, kGlobalIndex ++) {
float kx = ck[kIndex].Kx;
float ky = ck[kIndex].Ky;
float kz = ck[kIndex].Kz;
float pm = ck[kIndex].PhiMag;
#pragma unroll
for (int tx = 0; tx < NC; tx++) {
float expArg = PIx2 *
(kx * sX[tx] +
ky * sY[tx] +
kz * sZ[tx]);
sQr[tx] += pm * cos(expArg);
sQi[tx] += pm * sin(expArg);
}
}
#pragma unroll
for (int tx = 0; tx < NC; tx++) {
int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;
Qr[xIndex] = sQr[tx];
Qi[xIndex] = sQi[tx];
}
}
void computeQ_GPU (int numK,int numX,
......@@ -43,9 +108,8 @@ void computeQ_GPU (int numK,int numX,
size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK/NC;
size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK/NC;
cl_int clStatus;
cl_mem ck;
ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus);
//ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus);
// size in bytes = numElems*sizeof(struct kValues))
int QGrid;
for (QGrid = 0; QGrid < QGrids; QGrid++) {
......@@ -53,28 +117,11 @@ void computeQ_GPU (int numK,int numX,
int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID;
struct kValues* kValsTile = kVals + QGridBase;
int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase);
size_t bytes_x = numX * sizeof(float);
size_t bytes_kValTile = numElems*sizeof(struct kValues);
clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,ck,CL_TRUE,0,numElems*sizeof(struct kValues),kValsTile,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
ComputeQ_GPU(numK, QGridBase, x, y, z, Qr, Qi, kValsTile);
clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(int),&numK);
clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(int),&QGridBase);
clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&x_d);
clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(cl_mem),&y_d);
clStatus = clSetKernelArg(clPrm->clKernel,4,sizeof(cl_mem),&z_d);
clStatus = clSetKernelArg(clPrm->clKernel,5,sizeof(cl_mem),&Qr_d);
clStatus = clSetKernelArg(clPrm->clKernel,6,sizeof(cl_mem),&Qi_d);
clStatus = clSetKernelArg(clPrm->clKernel,7,sizeof(cl_mem),&ck);
CHECK_ERROR("clSetKernelArg")
printf ("Grid: %d, Block: %d\n", DimQGrid, DimQBlock);
clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimQGrid,&DimQBlock,0,NULL,NULL);
CHECK_ERROR("clEnqueueNDRangeKernel")
computeQ_GPU_kernel(numK, QGridBase, x, bytes_x, y, bytes_x, z, bytes_x, Qr, bytes_x, Qi, bytes_x, kValsTile, bytes_kValTile);
printf ("Grid: %lu, Block: %lu\n", DimQGrid, DimQBlock);
}
}
......
......@@ -25,14 +25,196 @@
*/
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <math.h>
#include <sys/time.h>
#include <parboil.h>
#include <CL/cl.h>
#include "ocl.h"
#include "file.h"
#include <endian.h>
#include <inttypes.h>
#include "macros.h"
#include "computeQ.h"
#define NC 4
#if __BYTE_ORDER != __LITTLE_ENDIAN
# error "File I/O is not implemented for this system: wrong endianness."
#endif
void inputData(char* fName, int* _numK, int* _numX,
float** kx, float** ky, float** kz,
float** x, float** y, float** z,
float** phiR, float** phiI)
{
int numK, numX;
FILE* fid = fopen(fName, "r");
if (fid == NULL)
{
fprintf(stderr, "Cannot open input file\n");
exit(-1);
}
fread (&numK, sizeof (int), 1, fid);
*_numK = numK;
fread (&numX, sizeof (int), 1, fid);
*_numX = numX;
*kx = (float *) memalign(16, numK * sizeof (float));
fread (*kx, sizeof (float), numK, fid);
*ky = (float *) memalign(16, numK * sizeof (float));
fread (*ky, sizeof (float), numK, fid);
*kz = (float *) memalign(16, numK * sizeof (float));
fread (*kz, sizeof (float), numK, fid);
*x = (float *) memalign(16, numX * sizeof (float));
fread (*x, sizeof (float), numX, fid);
*y = (float *) memalign(16, numX * sizeof (float));
fread (*y, sizeof (float), numX, fid);
*z = (float *) memalign(16, numX * sizeof (float));
fread (*z, sizeof (float), numX, fid);
*phiR = (float *) memalign(16, numK * sizeof (float));
fread (*phiR, sizeof (float), numK, fid);
*phiI = (float *) memalign(16, numK * sizeof (float));
fread (*phiI, sizeof (float), numK, fid);
fclose (fid);
}
void outputData(char* fName, float* outR, float* outI, int numX)
{
FILE* fid = fopen(fName, "w");
uint32_t tmp32;
if (fid == NULL)
{
fprintf(stderr, "Cannot open output file\n");
exit(-1);
}
/* Write the data size */
tmp32 = numX;
fwrite(&tmp32, sizeof(uint32_t), 1, fid);
/* Write the reconstructed data */
fwrite (outR, sizeof (float), numX, fid);
fwrite (outI, sizeof (float), numX, fid);
fclose (fid);
}
void __attribute__ ((noinline)) computePhiMag_kernel(float* phiR, size_t bytes_phiR, float* phiI, size_t bytes_phiI, float* phiMag, size_t bytes_phiMag, int numK) {
int indexK = get_global_id(0);
if (indexK < numK) {
float real = phiR[indexK];
float imag = phiI[indexK];
phiMag[indexK] = real*real + imag*imag;
}
}
void __attribute__ ((noinline)) computePhiMag(int numK, float* phiR, float* phiI, float* phiMag)
{
int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK;
if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK)
phiMagBlocks++;
size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK;
size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK;
size_t bytes_phi = numK * sizeof(float);
computePhiMag_kernel(phiR, bytes_phi, phiI, bytes_phi, phiMag, bytes_phi, numK);
}
void __attribute__ ((noinline)) computeQ_kernel(int numK, int kGlobalIndex,
float* x, size_t bytes_x, float* y, size_t bytes_y, float* z, size_t bytes_z,
float* Qr, size_t bytes_Qr, float* Qi, size_t bytes_Qi, struct kValues* ck, size_t bytes_ck)
{
float sX[NC];
float sY[NC];
float sZ[NC];
float sQr[NC];
float sQi[NC];
#pragma unroll
for (int tx = 0; tx < NC; tx++) {
int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;
sX[tx] = x[xIndex];
sY[tx] = y[xIndex];
sZ[tx] = z[xIndex];
sQr[tx] = Qr[xIndex];
sQi[tx] = Qi[xIndex];
}
// Loop over all elements of K in constant mem to compute a partial value
// for X.
int kIndex = 0;
for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID) && (kGlobalIndex < numK);
kIndex ++, kGlobalIndex ++) {
float kx = ck[kIndex].Kx;
float ky = ck[kIndex].Ky;
float kz = ck[kIndex].Kz;
float pm = ck[kIndex].PhiMag;
#pragma unroll
for (int tx = 0; tx < NC; tx++) {
float expArg = PIx2 *
(kx * sX[tx] +
ky * sY[tx] +
kz * sZ[tx]);
sQr[tx] += pm * cos(expArg);
sQi[tx] += pm * sin(expArg);
}
}
#pragma unroll
for (int tx = 0; tx < NC; tx++) {
int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + NC * get_local_id(0) + tx;
Qr[xIndex] = sQr[tx];
Qi[xIndex] = sQi[tx];
}
}
void __attribute__ ((noinline)) computeQ (int numK,int numX,
float* x, float* y, float* z,
struct kValues* kVals,
float* Qr, float* Qi
)
{
int QGrids = numK / KERNEL_Q_K_ELEMS_PER_GRID;
if (numK % KERNEL_Q_K_ELEMS_PER_GRID)
QGrids++;
int QBlocks = numX / KERNEL_Q_THREADS_PER_BLOCK;
if (numX % KERNEL_Q_THREADS_PER_BLOCK)
QBlocks++;
size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK/NC;
size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK/NC;
//ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus);
// size in bytes = numElems*sizeof(struct kValues))
int QGrid;
for (QGrid = 0; QGrid < QGrids; QGrid++) {
// Put the tile of K values into constant mem
int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID;
struct kValues* kValsTile = kVals + QGridBase;
int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase);
size_t bytes_x = numX * sizeof(float);
size_t bytes_kValTile = numElems*sizeof(struct kValues);
computeQ_kernel(numK, QGridBase, x, bytes_x, y, bytes_x, z, bytes_x, Qr, bytes_x, Qi, bytes_x, kValsTile, bytes_kValTile);
printf ("Grid: %lu, Block: %lu\n", DimQGrid, DimQBlock);
}
}
void createDataStructsCPU(int numK, int numX, float** phiMag,
float** Qr, float** Qi)
{
*phiMag = (float* ) memalign(16, numK * sizeof(float));
*Qr = (float*) memalign(16, numX * sizeof (float));
*Qi = (float*) memalign(16, numX * sizeof (float));
}
int
main (int argc, char *argv[]) {
......@@ -96,10 +278,10 @@ main (int argc, char *argv[]) {
/* GPU section 1 (precompute PhiMag) */
{
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
computePhiMag_GPU(numK, phiR, phiI, phiMag);
computePhiMag(numK, phiR, phiI, phiMag);
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
......@@ -118,14 +300,14 @@ main (int argc, char *argv[]) {
}
free(phiMag);
/* GPU section 2 */
{
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
computeQ_GPU(numK, numX, x, y, z, kVals, Qr, Qi);
computeQ(numK, numX, x, y, z, kVals, Qr, Qi);
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
......
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment