Commit a532ece7 authored by Hashim Sharif

Adding New Sampling Knobs to tensorConvSampSim -- with half interpolation

parent 714bef9e
@@ -18,15 +18,24 @@ perf,135 4,1,0 1.33 tensorConvolution tensorConvApprox dev conv
perf,136 4,1,1 1.33 tensorConvolution tensorConvApprox dev conv
perf,137 4,1,2 1.33 tensorConvolution tensorConvApprox dev conv
perf,138 4,1,3 1.33 tensorConvolution tensorConvApprox dev conv
samp,231 2,0 2.0 tensorConvolution tensorConvApprox dev conv
samp,232 2,1 2.0 tensorConvolution tensorConvApprox dev conv
samp,233 3,0 1.5 tensorConvolution tensorConvApprox dev conv
samp,234 3,1 1.5 tensorConvolution tensorConvApprox dev conv
samp,235 3,2 1.5 tensorConvolution tensorConvApprox dev conv
samp,236 4,0 1.33 tensorConvolution tensorConvApprox dev conv
samp,237 4,1 1.33 tensorConvolution tensorConvApprox dev conv
samp,238 4,2 1.33 tensorConvolution tensorConvApprox dev conv
samp,239 4,3 1.33 tensorConvolution tensorConvApprox dev conv
samp,231 2,0,0 2.0 tensorConvolution tensorConvApprox dev conv
samp,232 2,1,0 2.0 tensorConvolution tensorConvApprox dev conv
samp,233 3,0,0 1.5 tensorConvolution tensorConvApprox dev conv
samp,234 3,1,0 1.5 tensorConvolution tensorConvApprox dev conv
samp,235 3,2,0 1.5 tensorConvolution tensorConvApprox dev conv
samp,236 4,0,0 1.33 tensorConvolution tensorConvApprox dev conv
samp,237 4,1,0 1.33 tensorConvolution tensorConvApprox dev conv
samp,238 4,2,0 1.33 tensorConvolution tensorConvApprox dev conv
samp,239 4,3,0 1.33 tensorConvolution tensorConvApprox dev conv
samp,240 2,0,1 2.0 tensorConvolution tensorConvApprox dev conv
samp,241 2,1,1 2.0 tensorConvolution tensorConvApprox dev conv
samp,242 3,0,1 1.5 tensorConvolution tensorConvApprox dev conv
samp,243 3,1,1 1.5 tensorConvolution tensorConvApprox dev conv
samp,244 3,2,1 1.5 tensorConvolution tensorConvApprox dev conv
samp,245 4,0,1 1.33 tensorConvolution tensorConvApprox dev conv
samp,246 4,1,1 1.33 tensorConvolution tensorConvApprox dev conv
samp,247 4,2,1 1.33 tensorConvolution tensorConvApprox dev conv
samp,248 4,3,1 1.33 tensorConvolution tensorConvApprox dev conv
red_samp,41 1 1.5 tensorReduction tensorReduction dev red
red_samp,42 1 2.25 tensorReduction tensorReduction dev red
red_samp,43 1 1.4 tensorReduction tensorReduction dev red
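Each samp knob row above now carries three comma-separated parameters instead of two: skip_every (the sampling rate), skip_offset, and the new interpolation_id (0 selects the original full-interpolation scaling, 1 the new half-interpolation scaling), followed by the estimated speedup and the operation/approximation names. Below is a minimal C++ sketch of how one such parameter field could be decoded; it mirrors the std::getline-based parsing added further down in this commit, but the helper name, struct, and use of std::stringstream here are illustrative assumptions, not the runtime's actual API.

#include <cstdlib>
#include <sstream>
#include <string>

// Illustrative-only struct mirroring the SampParams fields added below.
struct KnobParams { int skip_every; int skip_offset; int interpolation_id; };

// Hypothetical helper: decode a knob parameter field such as "2,0,1".
KnobParams parseSampKnob(const std::string& field) {
  std::stringstream token_stream(field);
  std::string tok;
  KnobParams p;
  std::getline(token_stream, tok, ','); p.skip_every = atoi(tok.c_str());
  std::getline(token_stream, tok, ','); p.skip_offset = atoi(tok.c_str());
  std::getline(token_stream, tok, ','); p.interpolation_id = atoi(tok.c_str());
  return p;
}

// e.g. parseSampKnob("2,0,1") -> {2, 0, 1}: sample every 2nd filter element,
// offset 0, half interpolation (knob samp,240 above).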
@@ -67,7 +67,17 @@
# After fixing Yasmin's code: first batch of runs on CIFAR-10 DNNs
# NOTE: First batch with 33% sampling - 2K runs for each threshold
batch_id = "batch325"
#-- batch_id = "batch325"
# After fixing Yasmin's code: second batch of runs on CIFAR-10 DNNs
# NOTE: Second batch with 33% sampling - 5K runs for each threshold
# NOTE: First batch with dumping CPU runtime configs
#-- batch_id = "batch327"
# IMP: Increased SAMPLING Knobs ---- Adding interpolation-based Knobs
batch_id = "batch328"
@@ -19,7 +19,7 @@ class DevTimeTuner:
def __init__(self, Bench):
self.piped_execution = True
self.autotuner_runs = 100
self.autotuner_runs = 8000
self.promise_binary = Bench.promise_binary
@@ -55,7 +55,8 @@ void postInterpolateRow(int N, int n, int c, int h, int w,
__global__
void postInterpolateCol(int N, int n, int c, int h, int w, float* data, int int_col){
void postInterpolateCol(int N, int n, int c, int h, int w,
float* data, int int_col){
int index = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
@@ -254,7 +255,6 @@ void sampleFilterElems(int N,
int ch = (i % (c * h * w)) / (h * w);
int n = i / (c * h * w);
//int local_index = row * w + col;
int local_index = (ch * (h * w)) + (row * w) + col;
if(skip_elem == 3 && h == 3 && w == 3){
@@ -284,12 +284,15 @@ void sampleFilter(Tensor* newFilter, Tensor* filter,
int numBlocks = (n * c * h * w + 127) / 128;
int N = n * c * h * w;
//float mul_factor = skip_rate / (skip_rate - 1);
float mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
//float mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
//mul_factor = (mul_factor + 1.0) / 2;
DEBUG ("mul_factor = %f \n", mul_factor);
//printf ("*** skip_rate = %d h = %d w = %d \n", skip_rate, h, w);
sampleFilterElems<<<numBlocks,128>>>(N,
n, c, h, w,
@@ -335,16 +338,17 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
Tensor* newFilter;
newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type,
CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
filter->dims.dim_sizes[3]);
CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
filter->dims.dim_sizes[3]);
// Zeroing (+Scaling) Filter elements to 'Simulate' input sampling
sampleFilter(newFilter, filter, skip_rate, skip_offset);
INFO("vertical_stride = %lu, horizontal_stride = %lu \n", vertical_stride, horizontal_stride);
INFO("vertical_stride = %lu, horizontal_stride = %lu \n",
vertical_stride, horizontal_stride);
checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
@@ -453,6 +457,194 @@ void* tensorConvSampSim(void* input_ptr, void* filter_ptr,
void sampleFilter2(Tensor* newFilter, Tensor* filter,
int skip_rate, int skip_offset, int interpolation_id){
int n = filter->dims.dim_sizes[0];
int c = filter->dims.dim_sizes[1];
int h = filter->dims.dim_sizes[2];
int w = filter->dims.dim_sizes[3];
int numBlocks = (n * c * h * w + 127) / 128;
int N = n * c * h * w;
float mul_factor;
if (interpolation_id == 0){
mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
}
else if (interpolation_id == 1){
mul_factor = (skip_rate * 1.0) / (skip_rate - 1);
mul_factor = (mul_factor + 1.0) / 2;
}
printf ("mul_factor = %f \n", mul_factor);
DEBUG ("mul_factor = %f \n", mul_factor);
sampleFilterElems<<<numBlocks,128>>>(N,
n, c, h, w,
(float *) filter->gpu_data,
skip_rate, skip_offset, mul_factor,
(float *) newFilter->gpu_data);
}
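sampleFilter2 differs from sampleFilter only in how the compensation factor is chosen: interpolation_id == 0 keeps the original scaling of skip_rate / (skip_rate - 1), while interpolation_id == 1 ("half interpolation") averages that factor with 1.0, so the retained filter elements are scaled up only half as aggressively. The following is a small standalone sketch of the same arithmetic with the concrete values it produces; the function and variable names are illustrative, not part of the runtime.

#include <cstdio>
#include <initializer_list>

// Same scaling rule as sampleFilter2 above (illustrative standalone copy).
float computeMulFactor(int skip_rate, int interpolation_id) {
  float full = (skip_rate * 1.0f) / (skip_rate - 1);            // full interpolation
  return (interpolation_id == 0) ? full : (full + 1.0f) / 2.0f; // half interpolation
}

int main() {
  // skip_rate = 2: full = 2.00, half = 1.50
  // skip_rate = 3: full = 1.50, half = 1.25
  // skip_rate = 4: full = 1.33, half = 1.17 (approx.)
  for (int rate : {2, 3, 4})
    printf("rate=%d full=%.2f half=%.2f\n", rate,
           computeMulFactor(rate, 0), computeMulFactor(rate, 1));
  return 0;
}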
// A 'Simulation' of sampled tensor convolution (filter-element sampling)
void* tensorConvSampSim2(void* input_ptr, void* filter_ptr,
int vertical_pad, int horizontal_pad,
int vertical_stride, int horizontal_stride,
int conv_mode, int conv_groups,
int skip_rate, int skip_offset, int interpolation_id){
INFO("*** TensorConvolution \n");
profileEvent("tensorConv");
Tensor* input = (Tensor*) input_ptr;
Tensor* filter = (Tensor*) filter_ptr;
cudnnConvolutionDescriptor_t convDesc;
cudnnConvolutionFwdAlgo_t convAlgo;
cudnnConvolutionMode_t mode;
if(conv_mode == 0)
mode = CUDNN_CONVOLUTION;
else if(conv_mode == 1)
mode = CUDNN_CROSS_CORRELATION;
float alpha = 1.0f, beta = 0.0f;
hostToDeviceCopy(input);
hostToDeviceCopy(filter);
convertToFP32(input);
convertToFP32(filter);
Tensor* newFilter;
newFilter = (Tensor *) create4DTensor((cudnnDataType_t) float_type,
CUDNN_TENSOR_NCHW, filter->dims.dim_sizes[0],
filter->dims.dim_sizes[1], filter->dims.dim_sizes[2],
filter->dims.dim_sizes[3]);
// Zeroing (+Scaling) Filter elements to 'Simulate' input sampling
sampleFilter2(newFilter, filter, skip_rate, skip_offset, interpolation_id);
INFO("vertical_stride = %lu, horizontal_stride = %lu \n",
vertical_stride, horizontal_stride);
checkCUDNN(cudnnCreateConvolutionDescriptor(&convDesc));
//FIXME: Current hack to preserve backward compatibility
if(conv_groups == 0){
conv_groups = 1;
}
// NOTE: Adding support for grouped convolution
checkCUDNN(cudnnSetConvolutionGroupCount(convDesc, conv_groups));
int new_v = vertical_stride + 0;
int new_h = horizontal_stride + 0;
cudnnDataType_t computeType = CUDNN_DATA_FLOAT;
checkCUDNN(cudnnSetConvolution2dDescriptor(convDesc,
vertical_pad, horizontal_pad, // conv padding
new_v, new_h, // conv strides
1, 1, // upscaling values
mode , // mode is configurable
computeType)); // defines compute precision
int n, c, h, w; // output dimensions
// Find dimension of convolution output
checkCUDNN(cudnnGetConvolution2dForwardOutputDim(convDesc,
input->tensor_desc,
filter->filter_desc,
&n, &c, &h, &w));
DEBUG("**Output Tensor Dims, n = %d, c = %d, h = %d, w = %d \n", n, c, h, w);
Tensor* output;
output = (Tensor*) create4DTensor((cudnnDataType_t) float_type,
CUDNN_TENSOR_NCHW, n, c, h, w);
// NOTE: Changing output tensor placement from host to device
changeTensorPlacement(output, DEVICE);
// NOTE: Necessary to insert the above call for every output tensor
DEBUG("tensor->data_type = %d, tensor->data_format = %d, N = %d, C = %d, H = %d, W = %d \n",
output->data_type, output->data_format, output->dims.dim_sizes[0],
output->dims.dim_sizes[1],
output->dims.dim_sizes[2], output->dims.dim_sizes[3]);
if(convDesc == NULL || input->tensor_desc == NULL ||
filter->filter_desc == NULL || output->tensor_desc == NULL)
ERROR("NULL descriptor! \n");
// NOTE-FIXIT: function failing for NHWC formats - perhaps some CUDNN support is lacking
checkCUDNN(cudnnGetConvolutionForwardAlgorithm(cudnnHandle,
input->tensor_desc,
filter->filter_desc,
convDesc,
output->tensor_desc,
CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
//CUDNN_CONVOLUTION_FWD_NO_WORKSPACE,
0,
&convAlgo));
DEBUG("ConvAlgo = %d, FFT = %d, GEMM = %d, WINOGRAD = %d \n", convAlgo,
CUDNN_CONVOLUTION_FWD_ALGO_FFT, CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD);
// NOTE: Using GEMM-based Algo
convAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
size_t workspace_size;
checkCUDNN(cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle,
input->tensor_desc,
filter->filter_desc,
convDesc,
output->tensor_desc,
convAlgo,
&workspace_size));
// Allocating memory for the convolution workspace
void* workspace;
checkCudaErrors(cudaMalloc(&workspace, workspace_size));
DEBUG("workspace size = %d \n", workspace_size);
checkCUDNN(cudnnConvolutionForward(cudnnHandle, &alpha, input->tensor_desc,
input->gpu_data, filter->filter_desc, newFilter->gpu_data,
convDesc, convAlgo, workspace, workspace_size,
&beta, output->tensor_desc, output->gpu_data));
freeTensor(newFilter);
profileEvent("tensorConv_end", true);
return output;
}
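tensorConvSampSim2 is a drop-in extension of tensorConvSampSim with one extra trailing argument, interpolation_id; the autotuner wrapper further down calls it with the knob's skip_rate, skip_offset, and interpolation_id. A hedged usage sketch follows; the padding and stride values are placeholders chosen for illustration, not taken from the diff, and 'input' and 'filter' are assumed to be Tensor* handles already created by the runtime.

void* conv_out = tensorConvSampSim2(input, filter,
                                    1, 1,   // vertical_pad, horizontal_pad (placeholder values)
                                    1, 1,   // vertical_stride, horizontal_stride (placeholder values)
                                    1, 1,   // conv_mode = 1 (cross-correlation), conv_groups = 1
                                    2, 0,   // skip_rate = 2, skip_offset = 0
                                    1);     // interpolation_id = 1 -> half interpolation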
/************ NOTE: API for ApproxHPVM Wrapper runtime *******/
@@ -833,15 +1025,17 @@ class SampParams{
public:
int skip_rate;
int skip_offset;
int interpolation_id;
SampParams(){
skip_rate = 1;
skip_offset = 0;
}
SampParams(int skip_rate1, int skip_offset1){
SampParams(int skip_rate1, int skip_offset1, int interpolation_id1){
skip_rate = skip_rate1;
skip_offset = skip_offset1;
interpolation_id = interpolation_id1;
}
};
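Note that the default SampParams constructor above initializes skip_rate and skip_offset but not the new interpolation_id field. A defensive variant would zero it as well, so that a default-constructed knob falls back to full-interpolation scaling; the snippet below is a suggested sketch, not code from this commit.

// Possible defensive default constructor (suggestion only):
SampParams() {
  skip_rate = 1;
  skip_offset = 0;
  interpolation_id = 0;   // default to full-interpolation scaling
}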
@@ -892,8 +1086,11 @@ public:
std::getline(token_stream, tok, ',');
int offset = atoi(tok.c_str());
std::getline(token_stream, tok, ',');
int interpolation_id = atoi(tok.c_str());
printf ("skip_every = %d, offset = %d \n", skip_every, offset);
SampParams params(skip_every, offset);
SampParams params(skip_every, offset, interpolation_id);
samp_knob_map[knob] = params;
}
@@ -990,22 +1187,23 @@ void* Autotuner_SampConv(void* input, float i_min, float i_max,
DEBUG("params.skip_rate = %d, params.skip_offset = %d \n",
params.skip_rate, params.skip_offset);
/*
conv_out = tensorConvolutionKernelSamp(input, filter, conv_pad_h, conv_pad_w,
conv_stride_h, conv_stride_w,
1, 1,
2);
*/
void* conv_out;
if (!FP16_tuning){
conv_out = tensorConvSampSim(input, filter,
/* conv_out = tensorConvSampSim(input, filter,
conv_pad_h, conv_pad_w,
conv_stride_h, conv_stride_w, 1, 1,
params.skip_rate, params.skip_offset);
*/
conv_out = tensorConvSampSim2(input, filter,
conv_pad_h, conv_pad_w,
conv_stride_h, conv_stride_w, 1, 1,
params.skip_rate, params.skip_offset, params.interpolation_id);
}
else{
@@ -1174,9 +1372,9 @@ void* Autotuner_Pooling(void* input,
else {
pool_out = tensorHalfPooling(input, 0, pool_size, pool_size,
0, 0, pool_stride, pool_stride);
}
0, 0, pool_stride, pool_stride);
}
}
else{
@@ -1190,7 +1388,8 @@
void* Autotuner_Activation(void* input, int activation_id, int out_min, int out_max, int swing){
void* Autotuner_Activation(void* input, int activation_id,
int out_min, int out_max, int swing){
void* activation_out;
@@ -1243,7 +1442,8 @@ void* Autotuner_Activation(void* input, int activation_id, int out_min, int out_
return activation_out;
}
void *autotuner_tensorFft(void *input, bool inverse) {
void* autotuner_tensorFft(void *input, bool inverse) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");
abort();
@@ -1264,7 +1464,8 @@ void *autotuner_tensorFft(void *input, bool inverse) {
return NULL;
}
void *autotuner_tensorReduce(void *input, size_t axis, MathOp func) {
void* autotuner_tensorReduce(void *input, size_t axis, MathOp func) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");
abort();
@@ -1295,7 +1496,7 @@ void *autotuner_tensorReduce(void *input, size_t axis, MathOp func) {
return NULL;
}
void *autotuner_tensorProjectiveT(void *input, void *transformation) {
void* autotuner_tensorProjectiveT(void *input, void *transformation) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");
abort();
@@ -1313,7 +1514,8 @@ void *autotuner_tensorProjectiveT(void *input, void *transformation) {
return NULL;
}
void *autotuner_tensorMap1(MathOp func, void *input) {
void* autotuner_tensorMap1(MathOp func, void *input) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");
abort();
@@ -1335,7 +1537,7 @@ void *autotuner_tensorMap1(MathOp func, void *input) {
return NULL;
}
void *autotuner_tensorMap2(MathOp func, void *input1, void *input2) {
void* autotuner_tensorMap2(MathOp func, void *input1, void *input2) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");
abort();
@@ -1357,7 +1559,7 @@ void *autotuner_tensorMap2(MathOp func, void *input1, void *input2) {
return NULL;
}
void *autotuner_tensorMap3(MathOp func, void *input1, void *input2,
void* autotuner_tensorMap3(MathOp func, void *input1, void *input2,
void *input3) {
if(ONLINE_PROFILING){
ERROR("Online Profiling cannot be enabled\n");