Skip to content
Snippets Groups Projects
Commit ae440dd9 authored by Maria Kotsifakou's avatar Maria Kotsifakou
Browse files

Changes to reduction node

parent 24e3f957
No related branches found
No related tags found
No related merge requests found
......@@ -75,8 +75,6 @@ struct __attribute__((__packed__)) InStruct {
size_t bytesSy;
int m;
int n;
int* barrier;
size_t bytesBarrier;
int block_x;
int grid_x;
};
......@@ -94,7 +92,6 @@ void packData(struct InStruct* args, float* I, size_t bytesI,
float* Sx, size_t bytesSx,
float* Sy, size_t bytesSy,
int m, int n,
int* barrier, size_t bytesBarrier,
int block_x, int grid_x) {
args->I = I;
args->bytesI = bytesI;
......@@ -120,8 +117,6 @@ void packData(struct InStruct* args, float* I, size_t bytesI,
args->bytesSy = bytesSy;
args->m = m;
args->n = n;
args->barrier = barrier;
args->bytesBarrier = bytesBarrier;
args->block_x = block_x;
args->grid_x = grid_x;
}
......@@ -522,74 +517,42 @@ void WrapperComputeGradient(float *Is, size_t bytesIs,
/*
* Reduction
* G : input
* maxG: output. Needs to be initialized to 0.0 every time
* maxG: output
* m, n: input size
* barrier: variable used for syncronization
* - Needs to be initialized to the number of thread blocks (= number of SMs)
* every time.
* Needs a single thread block
*/
void computeMaxGradientLeaf(float *G, size_t bytesG,
float *maxG, size_t bytesMaxG,
int m, int n,
int *barrier, size_t bytesBarrier) {
int m, int n) {
__visc__hint(visc::DEVICE);
//TODO: maxG should be initialized to zero (MIN_BR) every time
__visc__attributes(3, G, maxG, barrier, 1, maxG);
__visc__attributes(1, G, 1, maxG);
void* thisNode = __visc__getNode();
void* parentNode = __visc__getParentNode(thisNode);
int lx = __visc__getNodeInstanceID_x(thisNode); // threadIdx.x
int px = __visc__getNodeInstanceID_x(parentNode); // blockIdx.x
int dimx = __visc__getNumNodeInstances_x(thisNode); // blockDim.x
int pdimx = __visc__getNumNodeInstances_x(parentNode); // gridDim.x
int gid = lx + px*dimx;
int gridSize = dimx*pdimx;
// Thread blocks iterate over all elements
for (int i = gid+gridSize; i < m*n; i+= gridSize) {
if (G[gid] < G[i])
G[gid] = G[i];
}
// 1st warp of each thread block iterates over all the elements of the block
// Needs blockDim.x to be multiple of 32
if (lx < 32) {
for (int i = 32; (i < dimx) && (gid+i < m*n) ; i += 32) {
if (G[gid] < G[gid+i])
G[gid] = G[gid+i];
}
}
// Reduction within the warp
for (unsigned stride = 16; stride >= 1; stride >>= 1) {
if ((gid + stride < m*n) && (lx < stride))
if (G[gid] < G[gid + stride])
G[gid] = G[gid + stride];
// Assume a single thread block
// Thread block iterates over all elements
for (int i = lx + dimx; i < m*n; i+= dimx) {
if (G[lx] < G[i])
G[lx] = G[i];
}
// Global Barrier, to ensure that all blocks have computed their max values
// First thread iterates over all elements of the thread block
if (lx == 0) {
__visc__atomic_add(barrier,1);
while (*barrier < pdimx) {}
}
__visc__barrier();
for (int i = 1; (i < dimx) && (i < m*n); i++)
if (G[lx] < G[i])
G[lx] = G[i];
// Thread 0 computes the global maximum of all thread blocks's maxima
if (gid == 0) {
for (int i = dimx; i < gridSize ; i += dimx)
if (G[gid] < G[gid + i])
G[gid] = G[gid + i];
*maxG = G[lx];
}
__visc__return(m);
}
/*
* Reduction
* G : input
......@@ -636,7 +599,6 @@ void computeMaxGradientLeaf(float *G, size_t bytesG,
void computeMaxGradientTB(float *G, size_t bytesG,
float *maxG, size_t bytesMaxG,
int m, int n,
int* barrier, size_t bytesBarrier,
int block_x) {
__visc__hint(visc::DEVICE);
__visc__attributes(2, G, maxG, 1, maxG);
......@@ -647,8 +609,6 @@ void computeMaxGradientTB(float *G, size_t bytesG,
__visc__bindIn(CMGLeafNode, 3, 3, 0); // Bind bytesMaxG
__visc__bindIn(CMGLeafNode, 4, 4, 0); // Bind m
__visc__bindIn(CMGLeafNode, 5, 5, 0); // Bind n
__visc__bindIn(CMGLeafNode, 6, 6, 0); // Bind barrier
__visc__bindIn(CMGLeafNode, 7, 7, 0); // Bind bytesBarrier
__visc__bindOut(CMGLeafNode, 0, 0, 0); // bind output m
}
......@@ -656,7 +616,6 @@ void computeMaxGradientTB(float *G, size_t bytesG,
void WrapperComputeMaxGradient(float *G, size_t bytesG,
float *maxG, size_t bytesMaxG,
int m, int n,
int* barrier, size_t bytesBarrier,
int block_x, int grid_x) {
__visc__hint(visc::CPU_TARGET);
__visc__attributes(2, G, maxG, 1, maxG);
......@@ -667,9 +626,7 @@ void WrapperComputeMaxGradient(float *G, size_t bytesG,
__visc__bindIn(CMGTBNode, 3, 3, 0); // Bind bytesMaxG
__visc__bindIn(CMGTBNode, 4, 4, 0); // Bind m
__visc__bindIn(CMGTBNode, 5, 5, 0); // Bind n
__visc__bindIn(CMGTBNode, 6, 6, 0); // Bind barrier
__visc__bindIn(CMGTBNode, 7, 7, 0); // Bind bytesBarrier
__visc__bindIn(CMGTBNode, 8, 8, 0); // Bind block_x
__visc__bindIn(CMGTBNode, 6, 6, 0); // Bind block_x
__visc__bindOut(CMGTBNode, 0, 0, 0); // bind output m
}
......@@ -743,9 +700,8 @@ void edgeDetection(float *I, size_t bytesI, // 0
float *Sy, size_t bytesSy, // 20
int m, // 22
int n, // 23
int* barrier, size_t bytesBarrier, // 24
int block_x, // 26
int grid_x // 27
int block_x, // 24
int grid_x // 25
) {
__visc__attributes(5, I, Gs, B, Sx, Sy, 6, Is, L, S, G, maxG, E);
void* GSNode = __visc__createNode(WrapperGaussianSmoothing);
......@@ -808,10 +764,8 @@ void edgeDetection(float *I, size_t bytesI, // 0
// __visc__bindIn(CMGNode, 22, 4, 1) // Bind m
__visc__edge(CGNode, CMGNode, 0, 4, 1); // Get m
__visc__bindIn(CMGNode, 23, 5, 1); // Bind n
__visc__bindIn(CMGNode, 24, 6, 1); // Bind barrier
__visc__bindIn(CMGNode, 25, 7, 1); // Bind bytesBarrier
__visc__bindIn(CMGNode, 26, 8, 1); // Bind block_x
__visc__bindIn(CMGNode, 27, 9, 1); // Bind grid_x
__visc__bindIn(CMGNode, 24, 6, 1); // Bind block_x
__visc__bindIn(CMGNode, 25, 7, 1); // Bind grid_x
// Reject ZC Inputs
__visc__bindIn(RZCNode, 6 , 0, 1); // Bind S
......@@ -920,8 +874,6 @@ int main (int argc, char *argv[]) {
std::vector<float> matE(matIrow*matIcol);
size_t bytesMaxG = sizeof(float);
float* maxG = (float*)malloc(bytesMaxG);
size_t bytesBarrier = sizeof(int);
int* barrier = (int*)malloc(bytesBarrier);
float B[] = { 0, 1, 0,
1, 1, 1,
......@@ -979,14 +931,12 @@ int main (int argc, char *argv[]) {
Sx, bytesSx,
Sy, bytesSy,
matIrow, matIcol,
barrier, bytesBarrier,
block_x, grid_x);
// Check if the total elements is a multiple of block size
assert(matIrow*matIcol % block_x == 0);
*maxG = 0.0;
*barrier = 0;
for(unsigned j=0; j<NUM_RUNS; j++) {
std::cout << "Run: " << j << "\n";
void* DFG = __visc__launch(1, edgeDetection, (void*)args);
......@@ -1002,7 +952,6 @@ int main (int argc, char *argv[]) {
llvm_visc_track_mem(B, bytesB);
llvm_visc_track_mem(Sx, bytesSx);
llvm_visc_track_mem(Sy, bytesSy);
llvm_visc_track_mem(barrier, bytesBarrier);
//packData(args, &matA[0], BlockSize, &matB[i], BlockSize, &matC[i], BlockSize, BlockElements);
//__visc__push(DFG, args);
......@@ -1021,7 +970,6 @@ int main (int argc, char *argv[]) {
llvm_visc_request_mem(&matG[0], I_sz);
llvm_visc_request_mem(maxG, bytesMaxG);
llvm_visc_request_mem(&matE[0], I_sz);
llvm_visc_request_mem(barrier, bytesBarrier);
llvm_visc_untrack_mem(&matI[0]);
llvm_visc_untrack_mem(&matIs[0]);
......@@ -1034,7 +982,6 @@ int main (int argc, char *argv[]) {
llvm_visc_untrack_mem(B);
llvm_visc_untrack_mem(Sx);
llvm_visc_untrack_mem(Sy);
llvm_visc_untrack_mem(barrier);
__visc__wait(DFG);
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment