diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/Makefile b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/Makefile index f74ee8921a534b6963ba06d089398114571d070b..96acb24ed36b2cef49d3882580831d67e7659114 100644 --- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/Makefile +++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/Makefile @@ -6,3 +6,4 @@ VISC_OBJS=main.visc.ll APP_CUDALDFLAGS=-lm -lstdc++ APP_CFLAGS=-ffast-math -O3 APP_CXXFLAGS=-ffast-math -O3 +APP_OPTFLAGS=-unroll-threshold=250 -loop-unroll -scalarrepl diff --git a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc index 25f67c31f21e5eea68966fd69d56805852ad5f5b..6906f419524463786795073320a06060351686d1 100644 --- a/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc +++ b/llvm/test/VISC/parboil/benchmarks/sgemm/src/visc_sh/main.cc @@ -40,23 +40,27 @@ extern char* readFile(const char*); } typedef struct __attribute__((__packed__)) { - float *A; size_t bytesA; - int lda; - float *B; size_t bytesB; - int ldb; - float *C; size_t bytesC; - int ldc; - int k; - float alpha; - float beta; - int block_x; - int block_y; - int grid_x; - int grid_y; -} RootIn; + float *A; + size_t bytesA; + int lda; + float *B; + size_t bytesB; + int ldb; + float *C; + size_t bytesC; + int ldc; + int k; + float alpha; + float beta; + int block_x; + int block_y; + int grid_x; + int grid_y; +} +RootIn; void packData(RootIn* args, - float *A, size_t bytesA, + float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb, @@ -69,35 +73,35 @@ void packData(RootIn* args, int block_y, int grid_x, int grid_y) { - args->A = A; - args->bytesA = bytesA; - args->lda = lda; - args->B = B; - args->bytesB = bytesB; - args->ldb = ldb; - args->C = C; - args->bytesC = bytesC; - args->ldc = ldc; - args->k = k; - args->alpha = alpha; - args->beta = beta; - args->block_x = block_x; - args->block_y = block_y; - args->grid_x = grid_x; - args->grid_y = grid_y; + args->A = A; + args->bytesA = bytesA; + args->lda = lda; + args->B = B; + args->bytesB = bytesB; + args->ldb = ldb; + args->C = C; + args->bytesC = bytesC; + args->ldc = ldc; + args->k = k; + args->alpha = alpha; + args->beta = beta; + args->block_x = block_x; + args->block_y = block_y; + args->grid_x = grid_x; + args->grid_y = grid_y; } // TODO: decide between dynamic vs static allocation. Merely a convension - will // be translated. void Allocation(int block_x, int block_y) { - // Memory shared between threadblocks - //int bl_x = TILE_N; - //int bl_y = TILE_TB_HEIGHT; - //void* shB = __visc__malloc(bl_x*bl_y*sizeof(float)); - void* shB = __visc__malloc(block_x*block_y*sizeof(float)); - - //__visc__return(shB, bl_x*bl_y*sizeof(float)); - __visc__return(shB, block_x*block_y*sizeof(float)); + // Memory shared between threadblocks + //int bl_x = TILE_N; + //int bl_y = TILE_TB_HEIGHT; + //void* shB = __visc__malloc(bl_x*bl_y*sizeof(float)); + void* shB = __visc__malloc(block_x*block_y*sizeof(float)); + + //__visc__return(shB, bl_x*bl_y*sizeof(float)); + __visc__return(shB, block_x*block_y*sizeof(float)); } @@ -120,85 +124,36 @@ void SgemmLeaf( float* A, size_t bytesA, int lda, float* B, size_t bytesB, int l float c[TILE_N]; for (int i=0; i < TILE_N; i++) - c[i] = 0.0f; - //float c0 = 0; - //float c1 = 0; - //float c2 = 0; - //float c3 = 0; - //float c4 = 0; - //float c5 = 0; - //float c6 = 0; - //float c7 = 0; - //float c8 = 0; - //float c9 = 0; - //float c10 = 0; - //float c11 = 0; - //float c12 = 0; - //float c13 = 0; - //float c14 = 0; - //float c15 = 0; - + c[i] = 0.0f; + int mid = ly*dimx+lx; int m = gx * TILE_M + mid; int n = gy * TILE_N + lx; for (int i = 0; i < k; i+=TILE_TB_HEIGHT) { - float a; - //shB[ly][lx] = B[n+(i+ly)*ldb]; - shB[ly*dimx+lx] = B[n+(i+ly)*ldb]; - - __visc__barrier(); - for (int j = 0; j < TILE_TB_HEIGHT; j++) { - a = A[m + (i+j)*lda]; - for (int kk = 0; kk < TILE_N; kk++) { - //c[kk] += a * shB[j][kk]; - c[kk] += a * shB[j*dimx+kk]; - } - //c0 += a * shB[j*dimx+0]; - //c1 += a * shB[j*dimx+1]; - //c2 += a * shB[j*dimx+2]; - //c3 += a * shB[j*dimx+3]; - //c4 += a * shB[j*dimx+4]; - //c5 += a * shB[j*dimx+5]; - //c6 += a * shB[j*dimx+6]; - //c7 += a * shB[j*dimx+7]; - //c8 += a * shB[j*dimx+8]; - //c9 += a * shB[j*dimx+9]; - //c10 += a * shB[j*dimx+10]; - //c11 += a * shB[j*dimx+11]; - //c12 += a * shB[j*dimx+12]; - //c13 += a * shB[j*dimx+13]; - //c14 += a * shB[j*dimx+14]; - //c15 += a * shB[j*dimx+15]; - - } - __visc__barrier(); + float a; + //shB[ly][lx] = B[n+(i+ly)*ldb]; + shB[ly*dimx+lx] = B[n+(i+ly)*ldb]; + + __visc__barrier(); + for (int j = 0; j < TILE_TB_HEIGHT; j++) { + a = A[m + (i+j)*lda]; + for (int kk = 0; kk < TILE_N; kk++) { + //c[kk] += a * shB[j][kk]; + c[kk] += a * shB[j*dimx+kk]; + } + } + __visc__barrier(); } int t = ldc * gy * TILE_N + m; for (int i = 0; i < TILE_N; i++) { - C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i]; + C[t+i*ldc] = C[t+i*ldc] * beta + alpha * c[i]; } - //C[t+0*ldc] = C[t+0*ldc] * beta + alpha * c0; - //C[t+1*ldc] = C[t+1*ldc] * beta + alpha * c1; - //C[t+2*ldc] = C[t+2*ldc] * beta + alpha * c2; - //C[t+3*ldc] = C[t+3*ldc] * beta + alpha * c3; - //C[t+4*ldc] = C[t+4*ldc] * beta + alpha * c4; - //C[t+5*ldc] = C[t+5*ldc] * beta + alpha * c5; - //C[t+6*ldc] = C[t+6*ldc] * beta + alpha * c6; - //C[t+7*ldc] = C[t+7*ldc] * beta + alpha * c7; - //C[t+8*ldc] = C[t+8*ldc] * beta + alpha * c8; - //C[t+9*ldc] = C[t+9*ldc] * beta + alpha * c9; - //C[t+10*ldc] = C[t+10*ldc] * beta + alpha * c10; - //C[t+11*ldc] = C[t+11*ldc] * beta + alpha * c11; - //C[t+12*ldc] = C[t+12*ldc] * beta + alpha * c12; - //C[t+13*ldc] = C[t+13*ldc] * beta + alpha * c13; - //C[t+14*ldc] = C[t+14*ldc] * beta + alpha * c14; - //C[t+15*ldc] = C[t+15*ldc] * beta + alpha * c15; } // Thread block node for sgemm - Creates allocation node and leaf (thread) node -void SgemmTB(float *A, size_t bytesA, +void SgemmTB(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb, @@ -209,38 +164,38 @@ void SgemmTB(float *A, size_t bytesA, float beta, int block_x, int block_y) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - void* AllocationNode = __visc__createNode(Allocation); - void* SgemmLeafNode = __visc__createNode2D(SgemmLeaf, block_x, block_y); - - // Bind edges - __visc__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A - __visc__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA - __visc__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda - __visc__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B - __visc__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB - __visc__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb - __visc__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C - __visc__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC - __visc__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc - __visc__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k - __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha - __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta - - __visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x - __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y - - - // Create Edges between AllocationNode and BFSLeafNodeNode - __visc__edge(AllocationNode, SgemmLeafNode, 0, 12, 0); // Edge local_B - __visc__edge(AllocationNode, SgemmLeafNode, 1, 13, 0); // Edge bytes_local_B - - //TODO: bindOut : for now with out attribute + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void* AllocationNode = __visc__createNode(Allocation); + void* SgemmLeafNode = __visc__createNode2D(SgemmLeaf, block_x, block_y); + + // Bind edges + __visc__bindIn(SgemmLeafNode, 0, 0, 0); // Bind A + __visc__bindIn(SgemmLeafNode, 1, 1, 0); // Bind bytesA + __visc__bindIn(SgemmLeafNode, 2, 2, 0); // Bind lda + __visc__bindIn(SgemmLeafNode, 3, 3, 0); // Bind B + __visc__bindIn(SgemmLeafNode, 4, 4, 0); // Bind bytesB + __visc__bindIn(SgemmLeafNode, 5, 5, 0); // Bind ldb + __visc__bindIn(SgemmLeafNode, 6, 6, 0); // Bind C + __visc__bindIn(SgemmLeafNode, 7, 7, 0); // Bind bytesC + __visc__bindIn(SgemmLeafNode, 8, 8, 0); // Bind ldc + __visc__bindIn(SgemmLeafNode, 9, 9, 0); // Bind k + __visc__bindIn(SgemmLeafNode, 10, 10, 0); // Bind alpha + __visc__bindIn(SgemmLeafNode, 11, 11, 0); // Bind beta + + __visc__bindIn(AllocationNode, 12, 0, 0); // Bind block_x + __visc__bindIn(AllocationNode, 13, 1, 0); // Bind block_y + + + // Create Edges between AllocationNode and BFSLeafNodeNode + __visc__edge(AllocationNode, SgemmLeafNode, 0, 12, 0); // Edge local_B + __visc__edge(AllocationNode, SgemmLeafNode, 1, 13, 0); // Edge bytes_local_B + + //TODO: bindOut : for now with out attribute } // Root node for sgemm - Creates thread block node -void SgemmRoot(float *A, size_t bytesA, +void SgemmRoot(float *A, size_t bytesA, int lda, float *B, size_t bytesB, int ldb, @@ -253,27 +208,27 @@ void SgemmRoot(float *A, size_t bytesA, int block_y, int grid_x, int grid_y) { - __visc__hint(visc::CPU_TARGET); - __visc__attributes(3, A, B, C, 1, C); - void* SgemmTBNode = __visc__createNode2D(SgemmTB, grid_x, grid_y); - - // Bind edges - __visc__bindIn(SgemmTBNode, 0, 0, 0); // Bind A - __visc__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA - __visc__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda - __visc__bindIn(SgemmTBNode, 3, 3, 0); // Bind B - __visc__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB - __visc__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb - __visc__bindIn(SgemmTBNode, 6, 6, 0); // Bind C - __visc__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC - __visc__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc - __visc__bindIn(SgemmTBNode, 9, 9, 0); // Bind k - __visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha - __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta - __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x - __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y - - //TODO: bindOut : for now with out attribute + __visc__hint(visc::CPU_TARGET); + __visc__attributes(3, A, B, C, 1, C); + void* SgemmTBNode = __visc__createNode2D(SgemmTB, grid_x, grid_y); + + // Bind edges + __visc__bindIn(SgemmTBNode, 0, 0, 0); // Bind A + __visc__bindIn(SgemmTBNode, 1, 1, 0); // Bind bytesA + __visc__bindIn(SgemmTBNode, 2, 2, 0); // Bind lda + __visc__bindIn(SgemmTBNode, 3, 3, 0); // Bind B + __visc__bindIn(SgemmTBNode, 4, 4, 0); // Bind bytesB + __visc__bindIn(SgemmTBNode, 5, 5, 0); // Bind ldb + __visc__bindIn(SgemmTBNode, 6, 6, 0); // Bind C + __visc__bindIn(SgemmTBNode, 7, 7, 0); // Bind bytesC + __visc__bindIn(SgemmTBNode, 8, 8, 0); // Bind ldc + __visc__bindIn(SgemmTBNode, 9, 9, 0); // Bind k + __visc__bindIn(SgemmTBNode, 10, 10, 0); // Bind alpha + __visc__bindIn(SgemmTBNode, 11, 11, 0); // Bind beta + __visc__bindIn(SgemmTBNode, 12, 12, 0); // Bind block_x + __visc__bindIn(SgemmTBNode, 13, 13, 0); // Bind block_y + + //TODO: bindOut : for now with out attribute } // Creates root node for sgemm @@ -291,8 +246,8 @@ __attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int // In this code we assume the matrix sizes are multiple of tile size if ((m%TILE_M) || (n%TILE_N)) { - std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M - << "; n should be multiple of " << TILE_N << std::endl; + std::cerr << "unsupported size of matrix. m should be multiple of " << TILE_M + << "; n should be multiple of " << TILE_N << std::endl; return; } @@ -302,27 +257,27 @@ __attribute__((noinline)) void basicSgemm( char transa, char transb, int m, int // unsigned sgemmDFG = __visc__node(mysgemmNT, 2, 2, db[0], db[1], dg[0]/db[0], dg[1]/db[1], 12, A, bytesA, lda, B, bytesB, ldb, C, bytesC, ldc, k, alpha, beta, 0); - int block_x = TILE_N; - int block_y = TILE_TB_HEIGHT; - int grid_x = m/TILE_M; - int grid_y = n/TILE_N; - // Pack data in struct - RootIn* args = (RootIn*) malloc(sizeof(RootIn)); - packData(args, - A, bytesA, - lda, - B, bytesB, - ldb, - C, bytesC, - ldc, - k, - alpha, - beta, - block_x, - block_y, - grid_x, - grid_y - ); + int block_x = TILE_N; + int block_y = TILE_TB_HEIGHT; + int grid_x = m/TILE_M; + int grid_y = n/TILE_N; + // Pack data in struct + RootIn* args = (RootIn*) malloc(sizeof(RootIn)); + packData(args, + A, bytesA, + lda, + B, bytesB, + ldb, + C, bytesC, + ldc, + k, + alpha, + beta, + block_x, + block_y, + grid_x, + grid_y + ); void* sgemmDFG = __visc__launch(0, SgemmRoot, (void*) args); diff --git a/llvm/test/VISC/parboil/common/mk/visc.mk b/llvm/test/VISC/parboil/common/mk/visc.mk index 6c71fa2b69086b581d60661fa036af525f05ddad..ce2d2c6ecb8293aaa6c5a2555727d9c892259d58 100644 --- a/llvm/test/VISC/parboil/common/mk/visc.mk +++ b/llvm/test/VISC/parboil/common/mk/visc.mk @@ -97,6 +97,7 @@ TEST_OBJS = $(call INBUILDDIR,$(VISC_OBJS)) PARBOIL_OBJS = $(call INBUILDDIR,parboil.ll) KERNEL = $(TEST_OBJS).kernels.ll ifneq ($(TARGET),x86) + KERNEL_OPT = $(BUILDDIR)/$(APP).kernels.opt.ll KERNEL_LINKED = $(BUILDDIR)/$(APP).kernels.linked.ll #KERNEL = $(TEST_OBJS).kernels.ll PTX_ASSEMBLY = $(TEST_OBJS).nvptx.s @@ -142,9 +143,12 @@ clean : $(PTX_ASSEMBLY) : $(KERNEL_LINKED) $(CC) $(KERNEL_GEN_FLAGS) -S $< -o $@ -$(KERNEL_LINKED) : $(KERNEL) +$(KERNEL_LINKED) : $(KERNEL_OPT) $(LLVM_LINK) $(LIBCLC_NVPTX_LIB) -S $< -o $@ +$(KERNEL_OPT) : $(KERNEL) + $(OPT) $(APP_OPTFLAGS) -S $< -o $@ + $(BIN) : $(HOST_LINKED) $(CXX) -O3 $(LDFLAGS) $< -o $@