diff --git a/llvm/projects/hpvm-tensor-rt/bin/compute_install_times.py b/llvm/projects/hpvm-tensor-rt/bin/compute_install_times.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e59b72f023a7869e721ba62f923f5e4ca791113
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/bin/compute_install_times.py
@@ -0,0 +1,116 @@
+
+
+class TuningParameters:
+    def __init__(self):
+        self.iterations_measured = 150
+        self.total_iterations = 30000
+
+
+tunerParams = TuningParameters()
+
+
+class Benchmark:
+    def __init__(self):
+        self.tuner_time = 0
+
+
+### All times are real profiled times on the Jetson Board
+### Times are for 150 OpenTuner iterations on Jetson
+
+ResNet50 = Benchmark()
+ResNet50.tuner_time = 3.85 * 100 * 150  # secs per batch * 100 batches (50 images each) * 150 iterations
+
+VGG16_ImageNet = Benchmark()
+VGG16_ImageNet.tuner_time = 4.55 * 100 * 150  # secs per batch * 100 batches (50 images each) * 150 iterations
+
+AlexNet_ImageNet = Benchmark()
+AlexNet_ImageNet.tuner_time = 0.7 * 100 * 150  # secs per batch * 100 batches * 150 iterations
+
+
+VGG16_CIFAR10 = Benchmark()
+VGG16_CIFAR10.tuner_time = 1.54 * 60 * 60  # Time in hours
+
+
+VGG16_CIFAR100 = Benchmark()
+VGG16_CIFAR100.tuner_time = 1.57 * 60 * 60  # Time in hours
+
+
+ResNet18 = Benchmark()
+ResNet18.tuner_time = 0.52 * 60 * 60  # Time in hours; 12.9 measured for 1000 images
+
+
+MobileNet = Benchmark()
+MobileNet.tuner_time = 0.72 * 60 * 60  # Time in hours
+
+
+AlexNet_CIFAR10 = Benchmark()
+AlexNet_CIFAR10.tuner_time = 0.67 * 60 * 60  # Time in hours
+
+
+AlexNet2_CIFAR10 = Benchmark()
+AlexNet2_CIFAR10.tuner_time = 0.19 * 60 * 60  # Time in hours
+
+
+LeNet_CIFAR10 = Benchmark()
+LeNet_CIFAR10.tuner_time = 0.11 * 60 * 60  # Time in hours
+
+
+
+
+
+def getInstallTime(Bench):
+
+    ## We limit Pareto configs to 50 after the tuning iterations complete
+
+    tuner_invocations = tunerParams.total_iterations / tunerParams.iterations_measured
+
+    extrapolated_time = tuner_invocations * Bench.tuner_time
+
+    time_hours = extrapolated_time / (60 * 60)
+
+    return time_hours
+
+
+
+# Routine to compute extrapolated tuning times
+def computeExtrapolatedInstallTime():
+
+
+    resnet50_time = getInstallTime(ResNet50)
+    print ("*** ResNet50 time (hrs) = ", resnet50_time)
+
+    resnet18_time = getInstallTime(ResNet18)
+    print ("*** ResNet18 time (hrs) = ", resnet18_time)
+
+    mobilenet_time = getInstallTime(MobileNet)
+    print ("*** MobileNet time (hrs) = ", mobilenet_time)
+
+    vgg16_img_time = getInstallTime(VGG16_ImageNet)
+    print ("*** VGG16-Imagenet time (hrs) = ", vgg16_img_time)
+
+    vgg16_cifar10_time = getInstallTime(VGG16_CIFAR10)
+    print ("*** VGG16-CIFAR10 time (hrs) = ", vgg16_cifar10_time)
+
+    vgg16_cifar100_time = getInstallTime(VGG16_CIFAR100)
+    print ("*** VGG16-CIFAR100 time (hrs) = ", vgg16_cifar100_time)
+
+    alexnet_img_time = getInstallTime(AlexNet_ImageNet)
+    print ("*** AlexNet-Imagenet time (hrs) = ", alexnet_img_time)
+
+    alexnet_cifar10_time = getInstallTime(AlexNet_CIFAR10)
+    print ("*** AlexNet-CIFAR10 time (hrs) = ", alexnet_cifar10_time)
+
+    alexnet2_cifar10_time = getInstallTime(AlexNet2_CIFAR10)
+    print ("*** AlexNet2-CIFAR10 time (hrs) = ", alexnet2_cifar10_time)
+
+    lenet_cifar10_time = getInstallTime(LeNet_CIFAR10)
+    print ("*** LeNet-CIFAR10 time (hrs) = ", lenet_cifar10_time)
+
+
+
+
+
+if __name__ == "__main__":
+
+    computeExtrapolatedInstallTime()
+
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc
index 0aa33bc43dace1f847f44ed7ad6dcfc0082d014a..65d6335f75fb5f3e9469e42507e063a2b526aee8 100644
--- 
a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc @@ -4,15 +4,18 @@ #include <unistd.h> #include <fcntl.h> #include <sys/types.h> -#include <sys/stat.h> +#include <sys/stat.h> +#include <vector> #include <string.h> #include "tensor_runtime.h" #include "utils.h" +#include "tensor_custom_ops_cpu.h" -Tensor *gaussianFilter(float sigma, size_t w, size_t h, size_t n_chan) { + +Tensor* gaussianFilter(float sigma, size_t w, size_t h, size_t n_chan) { int64_t m = (w - 1) / 2, n = (h - 1) / 2; auto *data = new float[w * h]; float sum = 0.0f; @@ -57,64 +60,50 @@ TODOs: ****/ void* canny_filter(void* dataset) { - Tensor *gaussian = gaussianFilter(1.4, 5, 5, 1); - Tensor *kernel_x, *kernel_y; + + Tensor* gaussian = gaussianFilter(1.4, 5, 5, 1); + Tensor* kernel_x, *kernel_y; std::tie(kernel_x, kernel_y) = getSobelKernels(); // 0. Grayscale - auto *summed_image = autotuner_tensorReduce(dataset, 1, MathOp::Add); - auto *grayscale_image = autotuner_tensorMap1(MathOp::Avg3, summed_image); + auto* summed_image = tensorReduce(dataset, 1, MathOp::Add); + auto* grayscale_image = tensorMap1(MathOp::Avg3, summed_image); // 1. Denoise - auto *image2 = ConvLayer_PROMISE(grayscale_image, 0.0, 0.0, gaussian, - 0.0, 0.0, nullptr, 0.0, 0.0, 2, 2, 1, - 1, 0, 0, -1, 0.0, 0.0, 0); + + auto* image2 = tensorConvolution(grayscale_image, gaussian, + 2, 2, // padding + 1, 1, // strides + 1, 0); // conv_mode, conv_groups + // 2. Get edge gradient / direction - auto *grad_x = ConvLayer_PROMISE( - image2, 0.0, 0.0, kernel_x, 0.0, 0.0, nullptr, 0.0, 0.0, 1, 1, 1, 1, 0, 0, - -1, 0.0, 0.0, 0); - auto *grad_y = ConvLayer_PROMISE( - image2, 0.0, 0.0, kernel_y, 0.0, 0.0, nullptr, 0.0, 0.0, 1, 1, 1, 1, 0, 0, - -1, 0.0, 0.0, 0); - auto *grad_mag = autotuner_tensorMap2(MathOp::Hypot, grad_x, grad_y); + auto *grad_x = tensorConvolution(image2, kernel_x, + 1, 1, + 1, 1, + 1, 0); + + auto *grad_y = tensorConvolution(image2, kernel_y, + 1, 1, + 1, 1, + 1, 0); + + auto *grad_mag = tensorMap2(MathOp::Hypot, grad_x, grad_y); // 2.5. 
Normalize grad magnitude - auto *grad_max_1D = autotuner_tensorReduce(grad_mag, 2, MathOp::Max); - auto *grad_max = autotuner_tensorReduce(grad_max_1D, 3, MathOp::Max); - auto *grad_mag_norm = autotuner_tensorMap2(MathOp::Div, grad_mag, grad_max); + auto *grad_max_1D = tensorReduce(grad_mag, 2, MathOp::Max); + auto *grad_max = tensorReduce(grad_max_1D, 3, MathOp::Max); + auto *grad_mag_norm = tensorMap2(MathOp::Div, grad_mag, grad_max); return grad_mag_norm; } -const size_t batch_size = 500, total_max = 3000; -const float psnr_threshold = 30.0; +void* invoke_canny(void* input) { + + auto* result = canny_filter(input); -int main() { - const char *input_path = "../model_params/image_processing_5k"; - const char *ref_output_path = "../model_params/canny_ref_output"; - std::vector<float> psnr; - llvm_hpvm_initTensorRt(1); - size_t bstart = 0; - startMemTracking(); - while (true) { - Tensor *batch = readDataSet(input_path, bstart, batch_size); - if (batch == nullptr) - break; - - auto *result = main_procedure(batch); - auto *ref_output = readDataSet(ref_output_path, bstart, batch_size, 1); - std::vector<float> psnr_batch = PSNR(ref_output, result); - std::copy(psnr_batch.begin(), psnr_batch.end(), std::back_inserter(psnr)); - bstart += batch_size; - if (bstart >= total_max) - break; - freeBatchMemory(); - } - float violation = violationRate(psnr, psnr_threshold); - float mean_psnr = mean(psnr); - std::ofstream of("final_accuracy"); - of << violation * 100 << ", " << mean_psnr << '\n'; - return 0; + printf("Done with Canny \n"); + + return result; } @@ -128,84 +117,128 @@ int main(){ llvm_hpvm_initTensorRt(0); - - //std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/"); - std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/"); + std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); + std::string input_path = dir_prefix + std::string("norm_cifar_input.bin"); + std::string labels_path = dir_prefix + std::string("test_labels.bin"); + + void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin", + float_type, 32, 3, 3, 3); + void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin", + float_type, 1, 32, 1, 1); + void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin", + float_type, 32, 32, 3, 3); + void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin", + float_type, 1, 32, 1, 1); + void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin", + float_type, 64, 32, 3, 3); + void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin", + float_type, 1, 64, 1, 1); + void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin", + float_type, 64, 64, 3, 3); + void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin", + float_type, 1, 64, 1, 1); + void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin", + float_type, 128, 64, 3, 3); + void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin", + float_type, 1, 128, 1, 1); + void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin", + float_type, 128, 128, 3, 3); + void* conv6_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin", + float_type, 1, 128, 1, 1); + + void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin", + float_type, 1, 
1, 2048, 10); + void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin", + float_type, 1, 10, 1, 1); - std::string input_path = dir_prefix + std::string("input.bin"); - //void* input = readTrainedWeights(input_path.c_str(), 0,10000,3,32,32); - std::string labels_path = dir_prefix + std::string("labels.bin"); - //uint8_t* labels = readLabels(labels_path.c_str(),10000); - std::string conv2d_1_w_path = dir_prefix + std::string("conv0.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); - std::string conv2d_1_b_path = dir_prefix + std::string("conv_bias0.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv3.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); - std::string conv2d_2_b_path = dir_prefix + std::string("conv_bias3.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv6.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv_bias6.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv7.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv_bias7.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv8.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); - std::string conv2d_5_b_path = dir_prefix + std::string("conv_bias8.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string dense_1_w_path = dir_prefix + std::string("fc12.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); - std::string dense_1_b_path = dir_prefix + std::string("fc_bias12.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - - - startMemTracking(); - int test_input_size = 2000; - int batch_size = 2000; + int test_input_size = 5000; + int batch_size = 500; int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; // NOTE: Starting time profiling - startProfiling(); - + startProfiling(); + startMemTracking(); + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = 0; // NOTE: using Float as compute precision. 
FIXIT: use enum + for(int i = 0; i < batch_count; i++){ int start = i * batch_size; int end = (i + 1) * batch_size; - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); - void* var_1 = tensorAdd(var_0, conv2d_1_b); - void* var_2 = tensorTanh(var_1); - void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); - void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); - void* var_6 = tensorAdd(var_5, conv2d_2_b); - void* var_7 = tensorTanh(var_6); - void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); - void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_11 = tensorAdd(var_10, conv2d_3_b); - void* var_12 = tensorTanh(var_11); - void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_14 = tensorAdd(var_13, conv2d_4_b); - void* var_15 = tensorTanh(var_14); - void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorAdd(var_16, conv2d_5_b); - void* var_18 = tensorTanh(var_17); - void* var_19 = tensorPooling(var_18,0,2,2,0,0,2,2); - void* var_22 = tensorGemmGPU(var_19, dense_1_w); - void* var_23 = tensorAdd(var_22, dense_1_b); - void* var_24 = tensorSoftmax(var_23); + + + void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); + + void* conv1out = tensorConvolution(input, conv1_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv1out, conv1_bias); + void* conv1_tanh = tensorTanh(conv1out); + + // 2nd Layer + void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv2out, conv2_bias); + void* conv2_tanh = tensorTanh(conv2out); + void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); + + // 3rd Layer + void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv3out, conv3_bias); + void* conv3_tanh = tensorTanh(conv3out); + + // 4th Layer + void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv4out, conv4_bias); + void* conv4_tanh = tensorTanh(conv4out); + void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); + + // 5th Layer + void* conv5out = tensorConvolution(pool4out, conv5_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv5out, conv5_bias); + void* conv5_tanh = tensorTanh(conv5out); + + // 6th Layer + void* conv6out = tensorConvolution(conv5_tanh, conv6_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv6out, conv6_bias); + + void* conv6_tanh = tensorTanh(conv6out); + void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); + + // final FC Layer + void* gemm1out = tensorGemmGPU(pool6out, fc1_weights); + void* gemm1biasout = tensorAdd(gemm1out, fc1_bias); + void* result = tensorSoftmax(gemm1biasout); uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - float accuracy = computeAccuracy2(labels,batch_size,var_24); + float accuracy = computeAccuracy2(labels, batch_size, result); final_accuracy += accuracy; + + + std::vector<int> index_vector; + index_vector.push_back(1); + index_vector.push_back(2); + index_vector.push_back(3); + index_vector.push_back(4); + index_vector.push_back(5); - freeBatchMemory(); + + void* argmax_out = tensorArgMax(result); + void* select_out = tensorSelect2(argmax_out, index_vector); + void* reduced_input = tensorContract(input, select_out); + + + invoke_canny(reduced_input); + + + 
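+    // Free all tensors allocated for this batch iteration before starting the next one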
freeBatchMemory();
  }

  stopProfiling();

diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc
index 5c48b3b01f2641576e6ac725ae0a81f03d6a5dbb..95b571e5a0d710cf71f0bb714e658420751abf53 100644
--- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc
+++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/test_ops.cc
@@ -6,6 +6,7 @@
 
 #include "tensor_runtime.h"
 #include "utils.h"
+#include "tensor_custom_ops_cpu.h"
 
 
 void testTensorGemm(){
@@ -1098,6 +1099,105 @@ void testSampling_1_1(){
 
 
 
+void* testTensorArgMax(){
+
+  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 3, 1, 1);
+
+  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+
+  // Input 0
+  host_ptr[0] = 1;
+  host_ptr[1] = 7; // highest - max index = 1
+  host_ptr[2] = 3;
+
+  // Input 1
+  host_ptr[3] = 3;
+  host_ptr[4] = 3;
+  host_ptr[5] = 8; // highest - max index = 2
+
+  // Input 2
+  host_ptr[6] = 2;
+  host_ptr[7] = 5;
+  host_ptr[8] = 9; // highest - max index = 2
+
+  // Input 3
+  host_ptr[9] = 11; // highest - max index = 0
+  host_ptr[10] = 2;
+  host_ptr[11] = 8;
+
+  void* argmax_out = tensorArgMax(input);
+
+  // Expect output of the call below to be:
+  // 1 2 2 0
+  printTensorValues(argmax_out);
+
+  return argmax_out;
+}
+
+
+
+void* testTensorSelect(void* argmax_out){
+
+  void* select_out = tensorSelect(argmax_out, 2);
+  printf ("***** tensorSelect output \n");
+
+  printTensorValues(select_out);
+
+  return select_out;
+
+}
+
+
+void testTensorContract(void* select_out){
+
+  Tensor* input = (Tensor*) create4DTensor(CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 4, 4, 1, 1);
+  float* host_ptr = (float*) ((struct Tensor*) input)->host_data;
+
+  // Input 0
+  host_ptr[0] = 1;
+  host_ptr[1] = 1;
+  host_ptr[2] = 1;
+  host_ptr[3] = 1;
+
+  // Input 1
+  host_ptr[4] = 2;
+  host_ptr[5] = 2;
+  host_ptr[6] = 2;
+  host_ptr[7] = 2;
+
+  // Input 2
+  host_ptr[8] = 3;
+  host_ptr[9] = 3;
+  host_ptr[10] = 3;
+  host_ptr[11] = 3;
+
+  // Input 3
+  host_ptr[12] = 4;
+  host_ptr[13] = 4;
+  host_ptr[14] = 4;
+  host_ptr[15] = 4;
+
+
+  void* contract_out = tensorContract(input, select_out);
+  printf ("***** tensorContract output \n");
+
+  printTensorValues(contract_out);
+
+}
+
+
+
+void testNewTensorOps(){
+
+  void* argmax_out = testTensorArgMax();
+  void* select_out = testTensorSelect(argmax_out);
+  testTensorContract(select_out);
+
+}
+
+
+
+
@@ -1137,21 +1237,22 @@ int main(){
 
   // testPerforation2();
 
-
-  //testSampling();
-  //testSampling2();
-
-  //testSampling3();
-
+  /********* SAMPLING TESTS ****
   testSampling_3_3();
 
   testSampling_1_1();
+  *************/
+
+
+  testNewTensorOps();
+
+
   //testQuantization();
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..b9128c1a24ca5bd95a7e6fb9e962d56501558f8f
--- /dev/null
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h
@@ -0,0 +1,179 @@
+#pragma once
+
+#include "tensor.h"
+#include <stdlib.h>
+#include <vector>
+
+
+// Per-image argmax over the channel dimension; assumes input shape (batch, channels, 1, 1)
+void* tensorArgMax(void* input_ptr){
+
+  Tensor* input = (Tensor*) input_ptr;
+  float* host_ptr = (float*) input->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+
+  Tensor* output = (Tensor *) create4DTensor(0, 0, batch_size, 1, 1, 1);
+  changeTensorPlacement(output, HOST);
+
+  float* out_ptr = (float*) output->host_data;
+
+  for(int i = 0; i < 
batch_size; i++){
+
+    int start = i * channels;
+    float max_index = 0;
+    float max_val = host_ptr[start];
+    for(int j = 0; j < channels; j++){
+
+      int index = start + j;
+      //printf ("index = %d \n", index);
+      float val = host_ptr[index];
+      if (val > max_val){
+        max_val = val;
+        max_index = j;
+      }
+    }
+
+    out_ptr[i] = max_index;
+  }
+
+
+  return output;
+
+}
+
+
+
+
+
+// Emits a bitvector: out[i] = 1 iff input[i] == target_value
+void* tensorSelect(void* input_ptr, float target_value){
+
+  Tensor* input = (Tensor*) input_ptr;
+  float* host_ptr = (float*) input->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+
+  if (channels != 1){
+    printf("* Channels dimension must be 1 \n");
+    abort();
+  }
+
+  Tensor* output = (Tensor *) create4DTensor(0, 0, batch_size, 1, 1, 1);
+  changeTensorPlacement(output, HOST);
+  float* out_ptr = (float*) output->host_data;
+
+  for(int i = 0; i < batch_size; i++){
+    if (host_ptr[i] == target_value){
+      out_ptr[i] = 1;
+    }
+    else{
+      out_ptr[i] = 0;
+    }
+  }
+
+  return output;
+}
+
+
+
+
+void* tensorSelect2(void* input_ptr, const std::vector<int>& index_vector){
+
+  Tensor* input = (Tensor*) input_ptr;
+  float* host_ptr = (float*) input->host_data;
+
+  int batch_size = input->dims.dim_sizes[0];
+  int channels = input->dims.dim_sizes[1];
+
+  if (channels != 1){
+    printf("* Channels dimension must be 1 \n");
+    abort();
+  }
+
+  Tensor* output = (Tensor *) create4DTensor(0, 0, batch_size, 1, 1, 1);
+  changeTensorPlacement(output, HOST);
+  float* out_ptr = (float*) output->host_data;
+
+  for(int i = 0; i < batch_size; i++){
+
+    for(size_t j = 0; j < index_vector.size(); j++){
+      int target_value = index_vector[j];
+      if (host_ptr[i] == target_value){
+        out_ptr[i] = 1;
+        break;
+      }
+      else{
+        out_ptr[i] = 0;
+      }
+    }
+
+  }
+
+  return output;
+}
+
+
+
+
+
+
+long getOnesInVector(float* vector_host_ptr, long vector_length){
+
+  long ones_count = 0;
+  for(long i = 0; i < vector_length; i++){
+
+    if(vector_host_ptr[i] == 1)
+      ones_count += 1;
+  }
+
+  return ones_count;
+}
+
+
+// Packs the images whose bitvector entry is '1' into a new, smaller batch
+void* tensorContract(void* input_ptr, void* bitvector_ptr){
+
+  Tensor* input = (Tensor*) input_ptr;
+  float* host_ptr = (float*) input->host_data;
+
+  Tensor* bitvector = (Tensor*) bitvector_ptr;
+  float* vector_host_ptr = (float*) bitvector->host_data;
+  long vector_length = bitvector->dims.dim_sizes[0];
+
+  long reduced_batch_size = getOnesInVector(vector_host_ptr, vector_length);
+
+  long batch_size = input->dims.dim_sizes[0];
+  long channels = input->dims.dim_sizes[1];
+  long height = input->dims.dim_sizes[2];
+  long width = input->dims.dim_sizes[3];
+
+  long image_size = channels * height * width; // Computing size of each image
+
+  if (batch_size != vector_length){
+    printf("ERROR: bitvector length has to match input batch size \n");
+    abort();
+  }
+
+  Tensor* output = (Tensor *) create4DTensor(0, 0, reduced_batch_size, channels, height, width);
+  changeTensorPlacement(output, HOST);
+  float* out_ptr = (float*) output->host_data;
+
+  for(long i = 0; i < batch_size; i++){
+
+    // Include image if corresponding index in bitvector is '1'
+    if (vector_host_ptr[i] == 1){
+
+      for(long j = 0; j < image_size; j++){
+
+        out_ptr[j] = host_ptr[i * image_size + j];
+      }
+
+      out_ptr += image_size; // Update the output pointer to the next image boundary
+    }
+  }
+
+  return output;
+}
+
diff --git a/llvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_ops.txt b/llvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_ops.txt
index 
57f128d0e58bdad252b4b93cae526b8323d8779a..da42b2ad85397c72f2385724f4af52f3da6c0c78 100644 --- a/llvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_ops.txt +++ b/llvm/projects/soc_simulator/resnet50_imagenet/resnet50_imagenet_ops.txt @@ -1,282 +1,282 @@ #Conv1,4 -Conv -Add -Relu -Pool -#NML1,1 +Conv1 +Add1 +Relu1 +Pool1 +BatchNorm1 #Conv2,2 -Conv -Add +Conv2 +Add2 +BatchNorm2 +#NML1,1 +#Conv3,2 +Conv3 +Add3 +BatchNorm3 #NML2,1 +#Conv4,2 +Conv4 +Add4 +BatchNorm4 +#Conv5,2 +Conv5 +Add5 +BatchNorm5 #NML3,1 -#Conv3,2 -Conv -Add #NML4,1 +#Conv6,2 +Conv6 +Add6 +BatchNorm6 #NML5,1 -#Conv4,2 -Conv -Add +#Conv7,2 +Conv7 +Add7 +BatchNorm7 #NML6,1 -#Conv5,2 -Conv -Add +#Conv8,2 +Conv8 +Add8 +BatchNorm8 #NML7,1 #NML8,1 +#Conv9,2 +Conv9 +Add9 +BatchNorm9 #NML9,1 -#Conv6,2 -Conv -Add +#Conv10,2 +Conv10 +Add10 +BatchNorm10 #NML10,1 +#Conv11,2 +Conv11 +Add11 +BatchNorm11 #NML11,1 -#Conv7,2 -Conv -Add #NML12,1 +#Conv12,2 +Conv12 +Add12 +BatchNorm12 #NML13,1 -#Conv8,2 -Conv -Add +#Conv13,2 +Conv13 +Add13 +BatchNorm13 #NML14,1 +#Conv14,2 +Conv14 +Add14 +BatchNorm14 +#Conv15,2 +Conv15 +Add15 +BatchNorm15 #NML15,1 #NML16,1 -#Conv9,2 -Conv -Add +#Conv16,2 +Conv16 +Add16 +BatchNorm16 #NML17,1 +#Conv17,2 +Conv17 +Add17 +BatchNorm17 #NML18,1 -#Conv10,2 -Conv -Add +#Conv18,2 +Conv18 +Add18 +BatchNorm18 #NML19,1 #NML20,1 -#Conv11,2 -Conv -Add +#Conv19,2 +Conv19 +Add19 +BatchNorm19 #NML21,1 +#Conv20,2 +Conv20 +Add20 +BatchNorm20 #NML22,1 +#Conv21,2 +Conv21 +Add21 +BatchNorm21 #NML23,1 -#Conv12,2 -Conv -Add #NML24,1 +#Conv22,2 +Conv22 +Add22 +BatchNorm22 #NML25,1 -#Conv13,2 -Conv -Add +#Conv23,2 +Conv23 +Add23 +BatchNorm23 #NML26,1 +#Conv24,2 +Conv24 +Add24 +BatchNorm24 #NML27,1 -#Conv14,2 -Conv -Add #NML28,1 -#Conv15,2 -Conv -Add +#Conv25,2 +Conv25 +Add25 +BatchNorm25 #NML29,1 +#Conv26,2 +Conv26 +Add26 +BatchNorm26 #NML30,1 +#Conv27,2 +Conv27 +Add27 +BatchNorm27 +#Conv28,2 +Conv28 +Add28 +BatchNorm28 #NML31,1 -#Conv16,2 -Conv -Add #NML32,1 +#Conv29,2 +Conv29 +Add29 +BatchNorm29 #NML33,1 -#Conv17,2 -Conv -Add +#Conv30,2 +Conv30 +Add30 +BatchNorm30 #NML34,1 +#Conv31,2 +Conv31 +Add31 +BatchNorm31 #NML35,1 -#Conv18,2 -Conv -Add #NML36,1 +#Conv32,2 +Conv32 +Add32 +BatchNorm32 #NML37,1 +#Conv33,2 +Conv33 +Add33 +BatchNorm33 #NML38,1 -#Conv19,2 -Conv -Add +#Conv34,2 +Conv34 +Add34 +BatchNorm34 #NML39,1 #NML40,1 -#Conv20,2 -Conv -Add +#Conv35,2 +Conv35 +Add35 +BatchNorm35 #NML41,1 +#Conv36,2 +Conv36 +Add36 +BatchNorm36 #NML42,1 -#Conv21,2 -Conv -Add +#Conv37,2 +Conv37 +Add37 +BatchNorm37 #NML43,1 #NML44,1 +#Conv38,2 +Conv38 +Add38 +BatchNorm38 #NML45,1 -#Conv22,2 -Conv -Add +#Conv39,2 +Conv39 +Add39 +BatchNorm39 #NML46,1 +#Conv40,2 +Conv40 +Add40 +BatchNorm40 #NML47,1 -#Conv23,2 -Conv -Add #NML48,1 +#Conv41,2 +Conv41 +Add41 +BatchNorm41 #NML49,1 -#Conv24,2 -Conv -Add +#Conv42,2 +Conv42 +Add42 +BatchNorm42 #NML50,1 +#Conv43,2 +Conv43 +Add43 +BatchNorm43 #NML51,1 #NML52,1 -#Conv25,2 -Conv -Add +#Conv44,2 +Conv44 +Add44 +BatchNorm44 #NML53,1 +#Conv45,2 +Conv45 +Add45 +BatchNorm45 #NML54,1 -#Conv26,2 -Conv -Add +#Conv46,2 +Conv46 +Add46 +BatchNorm46 +#Conv47,2 +Conv47 +Add47 +BatchNorm47 #NML55,1 #NML56,1 -#Conv27,2 -Conv -Add +#Conv48,2 +Conv48 +Add48 +BatchNorm48 #NML57,1 -#Conv28,2 -Conv -Add +#Conv49,2 +Conv49 +Add49 +BatchNorm49 #NML58,1 +#Conv50,2 +Conv50 +Add50 +BatchNorm50 #NML59,1 #NML60,1 -#Conv29,2 -Conv -Add +#Conv51,2 +Conv51 +Add51 +BatchNorm51 #NML61,1 +#Conv52,2 +Conv52 +Add52 +BatchNorm52 #NML62,1 -#Conv30,2 -Conv -Add +#Conv53,2 +Conv53 +Add53 +BatchNorm53 #NML63,1 #NML64,1 -#Conv31,2 -Conv -Add 
#NML65,1
-#NML66,1
-#NML67,1
-#Conv32,2
-Conv
-Add
-#NML68,1
-#NML69,1
-#Conv33,2
-Conv
-Add
-#NML70,1
-#NML71,1
-#Conv34,2
-Conv
-Add
-#NML72,1
-#NML73,1
-#NML74,1
-#Conv35,2
-Conv
-Add
-#NML75,1
-#NML76,1
-#Conv36,2
-Conv
-Add
-#NML77,1
-#NML78,1
-#Conv37,2
-Conv
-Add
-#NML79,1
-#NML80,1
-#NML81,1
-#Conv38,2
-Conv
-Add
-#NML82,1
-#NML83,1
-#Conv39,2
-Conv
-Add
-#NML84,1
-#NML85,1
-#Conv40,2
-Conv
-Add
-#NML86,1
-#NML87,1
-#NML88,1
-#Conv41,2
-Conv
-Add
-#NML89,1
-#NML90,1
-#Conv42,2
-Conv
-Add
-#NML91,1
-#NML92,1
-#Conv43,2
-Conv
-Add
-#NML93,1
-#NML94,1
-#NML95,1
-#Conv44,2
-Conv
-Add
-#NML96,1
-#NML97,1
-#Conv45,2
-Conv
-Add
-#NML98,1
-#NML99,1
-#Conv46,2
-Conv
-Add
-#NML100,1
-#Conv47,2
-Conv
-Add
-#NML101,1
-#NML102,1
-#NML103,1
-#Conv48,2
-Conv
-Add
-#NML104,1
-#NML105,1
-#Conv49,2
-Conv
-Add
-#NML106,1
-#NML107,1
-#Conv50,2
-Conv
-Add
-#NML108,1
-#NML109,1
-#NML110,1
-#Conv51,2
-Conv
-Add
-#NML111,1
-#NML112,1
-#Conv52,2
-Conv
-Add
-#NML113,1
-#NML114,1
-#Conv53,2
-Conv
-Add
-#NML115,1
-#NML116,1
-#NML117,1
-#NML118,1
 #FC1,2
-Mul
-Add
+Mul1
+Add54
diff --git a/llvm/projects/soc_simulator/scripts/construct_ops.py b/llvm/projects/soc_simulator/scripts/construct_ops.py
index 3b655f2f5fb7ccb3eb4ac8db2e105cb74d71f986..3bcb2af9c345b19d86acaf92e3771c60370b4678 100644
--- a/llvm/projects/soc_simulator/scripts/construct_ops.py
+++ b/llvm/projects/soc_simulator/scripts/construct_ops.py
@@ -11,7 +11,7 @@
 op_map["pool"] = "Pool"
 op_map["relu"] = "Relu"
 op_map["activation"] = "Relu"
 op_map["tanh"] = "Tanh"
-op_map["batchnorm"] = "NML"
+op_map["batchnorm"] = "BatchNorm"
 
 
 unique_op_map = {}
@@ -45,19 +45,30 @@ if __name__ == "__main__":
 
     f2 = open(out_path, "w+")
 
     nml_id = 1
     conv_id = 1
     fc_id = 1
+    batchnorm_id = 1
 
     for x in f:
         toks = x.split()
         layer_len = len(toks)
 
+        #if "batchnorm" in toks:
+        #    f2.write("BatchNorm" + str(batchnorm_id) + "\n")
+        #    batchnorm_id += 1
+        #    continue
+
         if layer_len == 1 and "conv" not in toks and "dense" not in toks:
             f2.write("#NML" + str(nml_id) + ",1\n")
             nml_id += 1
 
+        layer_str = getLayerStr(toks)
+        f2.write(layer_str)
+
+
         if "conv" in toks:
             f2.write("#Conv" + str(conv_id) + "," + str(layer_len) + "\n")
diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/Makefile b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..fec9712539bcb79fc880293e90d4864ba5bf0e4f
--- /dev/null
+++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/Makefile
@@ -0,0 +1,83 @@
+DNN_BENCHMARK_ROOT = $(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks
+# NOTE: can configure build directory
+#HPVM_BUILD_DIR = $(LLVM_SRC_ROOT)/../build_hpvm/
+HPVM_BUILD_DIR = $(LLVM_BUILD_ROOT)
+
+CC = $(HPVM_BUILD_DIR)/bin/clang++
+OPT = $(HPVM_BUILD_DIR)/bin/opt
+LLVM_DIS = $(HPVM_BUILD_DIR)/bin/llvm-dis
+LLVM_LINK = $(HPVM_BUILD_DIR)/bin/llvm-link
+LLVM_INCLUDE_DIR = $(LLVM_SRC_ROOT)/include
+
+SRC_DIR = src
+BUILD_DIR = build
+APP = alexnet
+
+TENSOR_INCLUDE_DIR = $(DNN_BENCHMARK_ROOT)/common/include
+TENSOR_RT_INCLUDE_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/tensor_runtime/include
+TENSOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_runtime.a
+PROFILER_LIB_DIR = $(LLVM_SRC_ROOT)/projects/gpu_profiler/lib/libgpu_profiler.a
+SOC_SIMULATOR_LIB_DIR = $(LLVM_SRC_ROOT)/projects/soc_simulator/lib/libpromise_profiler.a
+TENSOR_AUTOTUNER_DIR = $(LLVM_SRC_ROOT)/projects/hpvm-tensor-rt/lib/libtensor_autotuner.a
+
+CC_FLAGS = -I $(LLVM_INCLUDE_DIR) -I $(TENSOR_INCLUDE_DIR) -I 
$(TENSOR_RT_INCLUDE_DIR) -I $(CUDA_INCLUDE_PATH) -fno-exceptions -ffast-math -std=c++11 -O3
+CC_FLAGS += -DDEVICE=CUDNN_TARGET
+LINKER_FLAGS = -lpthread -lcudart -lcurand -lcudnn -lcublas -lcufft -lOpenCL -lstdc++fs
+
+HPVM_LIB_DIR = $(HPVM_BUILD_DIR)/lib
+
+
+VISC_OPTFLAGS = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_CUDNN.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -dfg2llvm-cudnn -dfg2llvm-x86 -clearDFG
+
+
+PROMISE_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges.txt
+
+VISC_OPTFLAGS2 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_PROMISE.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_CUDNN.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-promise -quantization-levels-filename=$(PROMISE_QUANT_FILE_PATH) -dfg2llvm-cudnn -dfg2llvm-x86 -clearDFG
+
+WRAPPER_API_QUANT_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/quant_ranges_rt.txt
+CONF_FILE_PATH=$(LLVM_SRC_ROOT)/test/VISC/DNN_Benchmarks/benchmarks/$(APP)/data/tuner_confs.txt
+
+VISC_OPTFLAGS3 = -load $(HPVM_LIB_DIR)/LLVMBuildDFG.so -load $(HPVM_LIB_DIR)/LLVMInPlaceDFGAnalysis.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_WrapperAPI.so -load $(HPVM_LIB_DIR)/LLVMDFG2LLVM_X86.so -load $(HPVM_LIB_DIR)/LLVMFuseHPVMTensorNodes.so -load $(HPVM_LIB_DIR)/LLVMClearDFG.so -inplace -hpvm-fuse -dfg2llvm-wrapperapi -quantization-levels-filename=$(WRAPPER_API_QUANT_FILE_PATH) -configuration-inputs-filename=$(CONF_FILE_PATH) -dfg2llvm-x86 -clearDFG
+
+
+TARGET = $(BUILD_DIR)/$(APP).opt.bc
+SOURCES = $(SRC_DIR)/$(APP).cpp
+VISC_RT_PATH = $(LLVM_SRC_ROOT)/../build/projects/visc-rt/visc-rt.ll
+
+#OBJS = $(BUILD_DIR)/$(wildcard *.ll)
+.PRECIOUS: $(BUILD_DIR)/$(APP).ll $(BUILD_DIR)/$(APP).visc.ll
+default: $(BUILD_DIR) $(TARGET)
+
+
+$(BUILD_DIR)/%.ll: $(SRC_DIR)/%.cpp
+	$(CC) $(CC_FLAGS) -emit-llvm src/$(APP).cpp -S -o $(BUILD_DIR)/$(APP).ll
+	#---- $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_promise.cpp -S -o $(BUILD_DIR)/$(APP)_promise.ll
+	#---- $(CC) $(CC_FLAGS) -emit-llvm src/$(APP)_loop.cpp -S -o $(BUILD_DIR)/$(APP)_loop.ll
+
+$(BUILD_DIR)/%.opt.bc: $(BUILD_DIR)/%.ll
+	$(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP).ll -S -o $(BUILD_DIR)/$(APP).visc.ll
+	#----- $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_promise.ll -S -o $(BUILD_DIR)/$(APP)_promise.visc.ll
+	#----- $(OPT) -load LLVMGenVISC.so -genvisc -globaldce $(BUILD_DIR)/$(APP)_loop.ll -S -o $(BUILD_DIR)/$(APP)_loop.visc.ll
+	$(OPT) $(VISC_OPTFLAGS) $(BUILD_DIR)/$(APP).visc.ll -o $(BUILD_DIR)/$(APP)_cudnn.bc
+	#$(OPT) $(VISC_OPTFLAGS2) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_promise.bc
+
+	#--- $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_promise.visc.ll -o $(BUILD_DIR)/$(APP)_wrapperapi.bc
+	#--- $(OPT) $(VISC_OPTFLAGS3) $(BUILD_DIR)/$(APP)_loop.visc.ll -o $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc
+
+	$(LLVM_LINK) $(BUILD_DIR)/$(APP)_cudnn.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_cudnn_linked.bc
+	#$(LLVM_LINK) $(BUILD_DIR)/$(APP)_promise.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_promise_linked.bc
+
+	#---- $(LLVM_LINK) $(BUILD_DIR)/$(APP)_wrapperapi.bc $(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc
+	#--- $(LLVM_LINK) $(BUILD_DIR)/$(APP)_loop_wrapperapi.bc 
$(VISC_RT_PATH) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc + $(CC) $(BUILD_DIR)/$(APP)_cudnn_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_cudnn_linked $(LINKER_FLAGS) + #$(CC) $(BUILD_DIR)/$(APP)_promise_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_promise_linked $(LINKER_FLAGS) + + #--- $(CC) $(BUILD_DIR)/$(APP)_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_wrapperapi_linked $(LINKER_FLAGS) + #--- $(CC) $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked.bc $(TENSOR_LIB_DIR) $(PROFILER_LIB_DIR) $(SOC_SIMULATOR_LIB_DIR) -o $(BUILD_DIR)/$(APP)_loop_wrapperapi_linked $(LINKER_FLAGS) + + +$(BUILD_DIR): + mkdir -p $@ + +clean: + rm -rf $(BUILD_DIR) diff --git a/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/src/alexnet.cpp b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/src/alexnet.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5aae61e5f0673d260c32ad923bb1038f91a39a3b --- /dev/null +++ b/llvm/test/VISC/DNN_Benchmarks/benchmarks/alexnet_imagenet/src/alexnet.cpp @@ -0,0 +1,562 @@ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/stat.h> +#include <cstring> +#include <visc.h> +#include <tensorTypes.h> +#include <tensorUtils.h> + + +void var_0_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 2, 2, 4, 4); + __visc__return(2, r, (size_t) 0); +} + +void var_1_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_add(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_2_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_3_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); + __visc__return(2, r, (size_t) 0); +} + +void var_4_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 2, 2, 1, 1); + __visc__return(2, r, (size_t) 0); +} + +void var_5_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_add(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_6_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_7_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); + __visc__return(2, r, (size_t) 0); +} + +void var_8_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); + __visc__return(2, r, (size_t) 0); +} + +void var_9_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, 
t1, t2, 0); + + void *r = __visc__tensor_add(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_10_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_11_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); + __visc__return(2, r, (size_t) 0); +} + +void var_12_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_add(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_13_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_14_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_convolution(t1, t2, 1, 1, 1, 1); + __visc__return(2, r, (size_t) 0); +} + +void var_15_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_add(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_16_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_17_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_pool_max(t1, 3, 3, 0, 0, 2, 2); + __visc__return(2, r, (size_t) 0); +} + +void var_18_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_mul(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_19_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_add(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_20_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_21_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_mul(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_22_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_add(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_23_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_relu(t1); + __visc__return(2, r, (size_t) 0); +} + +void var_24_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = __visc__tensor_mul(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_25_node(void* t1, size_t bytes_t1, void* t2, size_t bytes_t2) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(2, t1, t2, 0); + + void *r = 
__visc__tensor_add(t1, t2); + __visc__return(2, r, (size_t) 0); +} + +void var_26_node(void* t1, size_t bytes_t1) { + __visc__hint(visc::CUDNN_TARGET); + __visc__attributes(1, t1, 0); + + void* r = __visc__tensor_softmax(t1); + __visc__return(2, r, (size_t) 0); +} + +void root(void* input, size_t input_bytes, + void* conv2d_1_w, size_t conv2d_1_w_bytes, + void* conv2d_1_b, size_t conv2d_1_b_bytes, + void* conv2d_2_w, size_t conv2d_2_w_bytes, + void* conv2d_2_b, size_t conv2d_2_b_bytes, + void* conv2d_3_w, size_t conv2d_3_w_bytes, + void* conv2d_3_b, size_t conv2d_3_b_bytes, + void* conv2d_4_w, size_t conv2d_4_w_bytes, + void* conv2d_4_b, size_t conv2d_4_b_bytes, + void* conv2d_5_w, size_t conv2d_5_w_bytes, + void* conv2d_5_b, size_t conv2d_5_b_bytes, + void* dense_1_w, size_t dense_1_w_bytes, + void* dense_1_b, size_t dense_1_b_bytes, + void* dense_2_w, size_t dense_2_w_bytes, + void* dense_2_b, size_t dense_2_b_bytes, + void* dense_3_w, size_t dense_3_w_bytes, + void* dense_3_b, size_t dense_3_b_bytes){ + + + __visc__hint(visc::CPU_TARGET); + __visc__attributes(17, input, conv2d_1_w, conv2d_1_b, conv2d_2_w, conv2d_2_b, conv2d_3_w, conv2d_3_b, conv2d_4_w, conv2d_4_b, conv2d_5_w, conv2d_5_b, dense_1_w, dense_1_b, dense_2_w, dense_2_b, dense_3_w, dense_3_b, 0); + + + void* var_0 = __visc__createNodeND(0, var_0_node); + + __visc__bindIn(var_0, 0, 0, 0); + __visc__bindIn(var_0, 1, 1, 0); + __visc__bindIn(var_0, 2, 2, 0); + __visc__bindIn(var_0, 3, 3, 0); + + void* var_1 = __visc__createNodeND(0, var_1_node); + + __visc__edge(var_0, var_1, 1, 0, 0, 0); + __visc__edge(var_0, var_1, 1, 1, 1, 0); + __visc__bindIn(var_1, 4, 2, 0); + __visc__bindIn(var_1, 5, 3, 0); + + void* var_2 = __visc__createNodeND(0, var_2_node); + + __visc__edge(var_1, var_2, 1, 0, 0, 0); + __visc__edge(var_1, var_2, 1, 1, 1, 0); + + void* var_3 = __visc__createNodeND(0, var_3_node); + + __visc__edge(var_2, var_3, 1, 0, 0, 0); + __visc__edge(var_2, var_3, 1, 1, 1, 0); + + void* var_4 = __visc__createNodeND(0, var_4_node); + + __visc__edge(var_3, var_4, 1, 0, 0, 0); + __visc__edge(var_3, var_4, 1, 1, 1, 0); + __visc__bindIn(var_4, 6, 2, 0); + __visc__bindIn(var_4, 7, 3, 0); + + void* var_5 = __visc__createNodeND(0, var_5_node); + + __visc__edge(var_4, var_5, 1, 0, 0, 0); + __visc__edge(var_4, var_5, 1, 1, 1, 0); + __visc__bindIn(var_5, 8, 2, 0); + __visc__bindIn(var_5, 9, 3, 0); + + void* var_6 = __visc__createNodeND(0, var_6_node); + + __visc__edge(var_5, var_6, 1, 0, 0, 0); + __visc__edge(var_5, var_6, 1, 1, 1, 0); + + void* var_7 = __visc__createNodeND(0, var_7_node); + + __visc__edge(var_6, var_7, 1, 0, 0, 0); + __visc__edge(var_6, var_7, 1, 1, 1, 0); + + void* var_8 = __visc__createNodeND(0, var_8_node); + + __visc__edge(var_7, var_8, 1, 0, 0, 0); + __visc__edge(var_7, var_8, 1, 1, 1, 0); + __visc__bindIn(var_8, 10, 2, 0); + __visc__bindIn(var_8, 11, 3, 0); + + void* var_9 = __visc__createNodeND(0, var_9_node); + + __visc__edge(var_8, var_9, 1, 0, 0, 0); + __visc__edge(var_8, var_9, 1, 1, 1, 0); + __visc__bindIn(var_9, 12, 2, 0); + __visc__bindIn(var_9, 13, 3, 0); + + void* var_10 = __visc__createNodeND(0, var_10_node); + + __visc__edge(var_9, var_10, 1, 0, 0, 0); + __visc__edge(var_9, var_10, 1, 1, 1, 0); + + void* var_11 = __visc__createNodeND(0, var_11_node); + + __visc__edge(var_10, var_11, 1, 0, 0, 0); + __visc__edge(var_10, var_11, 1, 1, 1, 0); + __visc__bindIn(var_11, 14, 2, 0); + __visc__bindIn(var_11, 15, 3, 0); + + void* var_12 = __visc__createNodeND(0, var_12_node); + + __visc__edge(var_11, var_12, 1, 0, 0, 0); 
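+  // The paired edges forward both outputs of the previous node: the result tensor (output 0) and its size in bytes (output 1)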
+ __visc__edge(var_11, var_12, 1, 1, 1, 0); + __visc__bindIn(var_12, 16, 2, 0); + __visc__bindIn(var_12, 17, 3, 0); + + void* var_13 = __visc__createNodeND(0, var_13_node); + + __visc__edge(var_12, var_13, 1, 0, 0, 0); + __visc__edge(var_12, var_13, 1, 1, 1, 0); + + void* var_14 = __visc__createNodeND(0, var_14_node); + + __visc__edge(var_13, var_14, 1, 0, 0, 0); + __visc__edge(var_13, var_14, 1, 1, 1, 0); + __visc__bindIn(var_14, 18, 2, 0); + __visc__bindIn(var_14, 19, 3, 0); + + void* var_15 = __visc__createNodeND(0, var_15_node); + + __visc__edge(var_14, var_15, 1, 0, 0, 0); + __visc__edge(var_14, var_15, 1, 1, 1, 0); + __visc__bindIn(var_15, 20, 2, 0); + __visc__bindIn(var_15, 21, 3, 0); + + void* var_16 = __visc__createNodeND(0, var_16_node); + + __visc__edge(var_15, var_16, 1, 0, 0, 0); + __visc__edge(var_15, var_16, 1, 1, 1, 0); + + void* var_17 = __visc__createNodeND(0, var_17_node); + + __visc__edge(var_16, var_17, 1, 0, 0, 0); + __visc__edge(var_16, var_17, 1, 1, 1, 0); + + void* var_18 = __visc__createNodeND(0, var_18_node); + + __visc__edge(var_17, var_18, 1, 0, 0, 0); + __visc__edge(var_17, var_18, 1, 1, 1, 0); + __visc__bindIn(var_18, 22, 2, 0); + __visc__bindIn(var_18, 23, 3, 0); + + void* var_19 = __visc__createNodeND(0, var_19_node); + + __visc__edge(var_18, var_19, 1, 0, 0, 0); + __visc__edge(var_18, var_19, 1, 1, 1, 0); + __visc__bindIn(var_19, 24, 2, 0); + __visc__bindIn(var_19, 25, 3, 0); + + void* var_20 = __visc__createNodeND(0, var_20_node); + + __visc__edge(var_19, var_20, 1, 0, 0, 0); + __visc__edge(var_19, var_20, 1, 1, 1, 0); + + void* var_21 = __visc__createNodeND(0, var_21_node); + + __visc__edge(var_20, var_21, 1, 0, 0, 0); + __visc__edge(var_20, var_21, 1, 1, 1, 0); + __visc__bindIn(var_21, 26, 2, 0); + __visc__bindIn(var_21, 27, 3, 0); + + void* var_22 = __visc__createNodeND(0, var_22_node); + + __visc__edge(var_21, var_22, 1, 0, 0, 0); + __visc__edge(var_21, var_22, 1, 1, 1, 0); + __visc__bindIn(var_22, 28, 2, 0); + __visc__bindIn(var_22, 29, 3, 0); + + void* var_23 = __visc__createNodeND(0, var_23_node); + + __visc__edge(var_22, var_23, 1, 0, 0, 0); + __visc__edge(var_22, var_23, 1, 1, 1, 0); + + void* var_24 = __visc__createNodeND(0, var_24_node); + + __visc__edge(var_23, var_24, 1, 0, 0, 0); + __visc__edge(var_23, var_24, 1, 1, 1, 0); + __visc__bindIn(var_24, 30, 2, 0); + __visc__bindIn(var_24, 31, 3, 0); + + void* var_25 = __visc__createNodeND(0, var_25_node); + + __visc__edge(var_24, var_25, 1, 0, 0, 0); + __visc__edge(var_24, var_25, 1, 1, 1, 0); + __visc__bindIn(var_25, 32, 2, 0); + __visc__bindIn(var_25, 33, 3, 0); + + void* var_26 = __visc__createNodeND(0, var_26_node); + + __visc__edge(var_25, var_26, 1, 0, 0, 0); + __visc__edge(var_25, var_26, 1, 1, 1, 0); + + __visc__bindOut(var_26, 0, 0, 0); + __visc__bindOut(var_26, 1, 1, 0); + +} + +struct ret_t { + void* tensor; + size_t bytes; +}; + +typedef struct __attribute__((__packed__)) { + void* input; + size_t input_bytes; + void* conv2d_1_w; + size_t conv2d_1_w_bytes; + void* conv2d_1_b; + size_t conv2d_1_b_bytes; + void* conv2d_2_w; + size_t conv2d_2_w_bytes; + void* conv2d_2_b; + size_t conv2d_2_b_bytes; + void* conv2d_3_w; + size_t conv2d_3_w_bytes; + void* conv2d_3_b; + size_t conv2d_3_b_bytes; + void* conv2d_4_w; + size_t conv2d_4_w_bytes; + void* conv2d_4_b; + size_t conv2d_4_b_bytes; + void* conv2d_5_w; + size_t conv2d_5_w_bytes; + void* conv2d_5_b; + size_t conv2d_5_b_bytes; + void* dense_1_w; + size_t dense_1_w_bytes; + void* dense_1_b; + size_t dense_1_b_bytes; + void* dense_2_w; + 
size_t dense_2_w_bytes; + void* dense_2_b; + size_t dense_2_b_bytes; + void* dense_3_w; + size_t dense_3_w_bytes; + void* dense_3_b; + size_t dense_3_b_bytes; + + struct ret_t r; +} +RootIn; + + +int main(){ + + std::string dir_prefix = std::string("/shared/hsharif3/alexnet_imagenet_tune/"); + std::string input_path = dir_prefix + std::string("test_input.bin"); + std::string labels_path = dir_prefix + std::string("test_labels.bin"); + std::string conv2d_1_w_path = dir_prefix + std::string("conv2d_1_w.bin"); + void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); + std::string conv2d_1_b_path = dir_prefix + std::string("conv2d_1_b.bin"); + void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); + std::string conv2d_2_w_path = dir_prefix + std::string("conv2d_2_w.bin"); + void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); + std::string conv2d_2_b_path = dir_prefix + std::string("conv2d_2_b.bin"); + void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); + std::string conv2d_3_w_path = dir_prefix + std::string("conv2d_3_w.bin"); + void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); + std::string conv2d_3_b_path = dir_prefix + std::string("conv2d_3_b.bin"); + void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); + std::string conv2d_4_w_path = dir_prefix + std::string("conv2d_4_w.bin"); + void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); + std::string conv2d_4_b_path = dir_prefix + std::string("conv2d_4_b.bin"); + void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); + std::string conv2d_5_w_path = dir_prefix + std::string("conv2d_5_w.bin"); + void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); + std::string conv2d_5_b_path = dir_prefix + std::string("conv2d_5_b.bin"); + void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); + std::string dense_1_w_path = dir_prefix + std::string("dense_1_w.bin"); + void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,9216,4096); + std::string dense_1_b_path = dir_prefix + std::string("dense_1_b.bin"); + void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,4096,1,1); + std::string dense_2_w_path = dir_prefix + std::string("dense_2_w.bin"); + void* dense_2_w = readTrainedWeights(dense_2_w_path.c_str(), 0,1,1,4096,4096); + std::string dense_2_b_path = dir_prefix + std::string("dense_2_b.bin"); + void* dense_2_b = readTrainedWeights(dense_2_b_path.c_str(), 0,1,4096,1,1); + std::string dense_3_w_path = dir_prefix + std::string("dense_3_w.bin"); + void* dense_3_w = readTrainedWeights(dense_3_w_path.c_str(), 0,1,1,4096,1000); + std::string dense_3_b_path = dir_prefix + std::string("dense_3_b.bin"); + void* dense_3_b = readTrainedWeights(dense_3_b_path.c_str(), 0,1,1000,1,1); + void* input = readTrainedWeights(input_path.c_str(), 0, 1000,3,224,224); + //uint32_t* labels = readLabels2(labels_path.c_str(),6000); + + uint32_t* labels = readLabels3(labels_path.c_str(), 1000); + + + __visc__init(); + RootIn* args = static_cast<RootIn*>(malloc(sizeof(RootIn))); + + args->input = input; + args->input_bytes = 0; + args->conv2d_1_w = conv2d_1_w; + args->conv2d_1_w_bytes = 0; + args->conv2d_1_b = conv2d_1_b; + args->conv2d_1_b_bytes = 0; + args->conv2d_2_w = conv2d_2_w; + args->conv2d_2_w_bytes = 0; + args->conv2d_2_b = conv2d_2_b; + args->conv2d_2_b_bytes = 0; + args->conv2d_3_w = conv2d_3_w; + 
args->conv2d_3_w_bytes = 0; + args->conv2d_3_b = conv2d_3_b; + args->conv2d_3_b_bytes = 0; + args->conv2d_4_w = conv2d_4_w; + args->conv2d_4_w_bytes = 0; + args->conv2d_4_b = conv2d_4_b; + args->conv2d_4_b_bytes = 0; + args->conv2d_5_w = conv2d_5_w; + args->conv2d_5_w_bytes = 0; + args->conv2d_5_b = conv2d_5_b; + args->conv2d_5_b_bytes = 0; + args->dense_1_w = dense_1_w; + args->dense_1_w_bytes = 0; + args->dense_1_b = dense_1_b; + args->dense_1_b_bytes = 0; + args->dense_2_w = dense_2_w; + args->dense_2_w_bytes = 0; + args->dense_2_b = dense_2_b; + args->dense_2_b_bytes = 0; + args->dense_3_w = dense_3_w; + args->dense_3_w_bytes = 0; + args->dense_3_b = dense_3_b; + args->dense_3_b_bytes = 0; + + void* dfg = __visc__launch(0, root, (void*) args); + + __visc__wait(dfg); + + void *result = static_cast<RootIn*>(args)->input; + hpvm_request_tensor(result, 0); + + __visc__cleanup(); + computeAccuracy3(labels, result); + return 0; + +}