// Patch overview. This is commentary only: everything from the first
// "diff --git" header onward is the original patch text, unchanged.
//
// NOTE: the line breaks of this patch are collapsed, so the context, '-' and
// '+' lines of each hunk appear run together on a few long physical lines.
//
// File 1: llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc
//   * Hunk -4,15 +4,18: adds #include <vector> and
//     #include "tensor_custom_ops_cpu.h". The <sys/stat.h> include and the
//     gaussianFilter signature line are removed and re-added; the visible
//     difference is only spacing / pointer-star placement.
//   * Hunk -57,64 +60,50: in canny_filter, the autotuner_tensorReduce,
//     autotuner_tensorMap1 and autotuner_tensorMap2 calls become tensorReduce,
//     tensorMap1 and tensorMap2, and the three ConvLayer_PROMISE calls become
//     tensorConvolution calls (arguments 2, 2, 1, 1, 1, 0 for the gaussian
//     kernel and 1, 1, 1, 1, 1, 0 for kernel_x / kernel_y). The constants
//     batch_size, total_max and psnr_threshold and the PSNR-based main() are
//     removed. invoke_canny(input) is added: it calls canny_filter(input),
//     prints "Done with Canny" and returns the result.
//   * Hunk -128,84 +117,128 (inside main): dir_prefix changes from
//     ../model_params/alexnet_cifar10_front/ to ../model_params/alexnet2_cifar10/;
//     the input and label files become norm_cifar_input.bin and
//     test_labels.bin. Six conv filter/bias pairs plus fc1 weights/bias are
//     loaded with readTrainedWeights. test_input_size goes 2000 -> 5000 and
//     batch_size 2000 -> 500; startMemTracking() is now called after
//     startProfiling(); conv_mode = 1 and conv_precision = 0 are introduced.
//     The per-batch network becomes six tensorConvolution + tensorAdd +
//     tensorTanh stages (tensorPooling after stages 2, 4 and 6), followed by
//     tensorGemmGPU, tensorAdd and tensorSoftmax. After computeAccuracy2, each
//     batch runs tensorArgMax(result), tensorSelect2 with indices 1..5 and
//     tensorContract(input, select_out), and hands the result to invoke_canny.
//
// File 2: llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h
//   * Hunk -2,6 +2,7: adds #include <vector>.
//   * Hunk -76,6 +77,47: adds tensorSelect2(input_ptr, index_vector). It aborts
//     unless input->dims.dim_sizes[1] is 1, creates an output tensor with
//     create4DTensor(0, 0, batch_size, 1, 1, 1), moves it to HOST, and for each
//     i writes 1 to out_ptr[i] when host_ptr[i] equals some entry of
//     index_vector, otherwise 0.
//
// Review notes (NOTE(review): confirm against the full sources):
//   * tensorSelect2: when index_vector is empty the inner loop body never
//     runs, so this function never assigns out_ptr[i].
//   * tensorSelect2: `int j` is compared with index_vector.size(), which is
//     an unsigned type; index_vector is also taken by value.
//   * main: the new readTrainedWeights paths are full string literals even
//     though dir_prefix holds the same directory string.
//   * main: the return value of tensorAdd is discarded for conv1..conv6 and
//     tensorTanh is applied to convNout; presumably tensorAdd updates its
//     first argument in place -- TODO confirm.
//   * main: index_vector is rebuilt on every batch iteration although its
//     contents do not depend on the loop variable.
//   * canny_filter's comment labels the last tensorConvolution argument
//     "conv_groups", while main passes a variable named conv_precision in that
//     position -- TODO confirm which name matches the runtime API.
diff --git a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc index 0aa33bc43dace1f847f44ed7ad6dcfc0082d014a..65d6335f75fb5f3e9469e42507e063a2b526aee8 100644 --- a/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc +++ b/llvm/projects/hpvm-tensor-rt/dnn_sources/src/alexnet_canny.cc @@ -4,15 +4,18 @@ #include <unistd.h> #include <fcntl.h> #include <sys/types.h> -#include <sys/stat.h> +#include <sys/stat.h> +#include <vector> #include <string.h> #include "tensor_runtime.h" #include "utils.h" +#include "tensor_custom_ops_cpu.h" -Tensor *gaussianFilter(float sigma, size_t w, size_t h, size_t n_chan) { + +Tensor* gaussianFilter(float sigma, size_t w, size_t h, size_t n_chan) { int64_t m = (w - 1) / 2, n = (h - 1) / 2; auto *data = new float[w * h]; float sum = 0.0f; @@ -57,64 +60,50 @@ TODOs: ****/ void* canny_filter(void* dataset) { - Tensor *gaussian = gaussianFilter(1.4, 5, 5, 1); - Tensor *kernel_x, *kernel_y; + + Tensor* gaussian = gaussianFilter(1.4, 5, 5, 1); + Tensor* kernel_x, *kernel_y; std::tie(kernel_x, kernel_y) = getSobelKernels(); // 0. Grayscale - auto *summed_image = autotuner_tensorReduce(dataset, 1, MathOp::Add); - auto *grayscale_image = autotuner_tensorMap1(MathOp::Avg3, summed_image); + auto* summed_image = tensorReduce(dataset, 1, MathOp::Add); + auto* grayscale_image = tensorMap1(MathOp::Avg3, summed_image); // 1. Denoise - auto *image2 = ConvLayer_PROMISE(grayscale_image, 0.0, 0.0, gaussian, - 0.0, 0.0, nullptr, 0.0, 0.0, 2, 2, 1, - 1, 0, 0, -1, 0.0, 0.0, 0); + + auto* image2 = tensorConvolution(grayscale_image, gaussian, + 2, 2, // padding + 1, 1, // strides + 1, 0); // conv_mode, conv_groups + // 2. 
Get edge gradient / direction - auto *grad_x = ConvLayer_PROMISE( - image2, 0.0, 0.0, kernel_x, 0.0, 0.0, nullptr, 0.0, 0.0, 1, 1, 1, 1, 0, 0, - -1, 0.0, 0.0, 0); - auto *grad_y = ConvLayer_PROMISE( - image2, 0.0, 0.0, kernel_y, 0.0, 0.0, nullptr, 0.0, 0.0, 1, 1, 1, 1, 0, 0, - -1, 0.0, 0.0, 0); - auto *grad_mag = autotuner_tensorMap2(MathOp::Hypot, grad_x, grad_y); + auto *grad_x = tensorConvolution(image2, kernel_x, + 1, 1, + 1, 1, + 1, 0); + + auto *grad_y = tensorConvolution(image2, kernel_y, + 1, 1, + 1, 1, + 1, 0); + + auto *grad_mag = tensorMap2(MathOp::Hypot, grad_x, grad_y); // 2.5. Normalize grad magnitude - auto *grad_max_1D = autotuner_tensorReduce(grad_mag, 2, MathOp::Max); - auto *grad_max = autotuner_tensorReduce(grad_max_1D, 3, MathOp::Max); - auto *grad_mag_norm = autotuner_tensorMap2(MathOp::Div, grad_mag, grad_max); + auto *grad_max_1D = tensorReduce(grad_mag, 2, MathOp::Max); + auto *grad_max = tensorReduce(grad_max_1D, 3, MathOp::Max); + auto *grad_mag_norm = tensorMap2(MathOp::Div, grad_mag, grad_max); return grad_mag_norm; } -const size_t batch_size = 500, total_max = 3000; -const float psnr_threshold = 30.0; +void* invoke_canny(void* input) { + + auto* result = canny_filter(input); -int main() { - const char *input_path = "../model_params/image_processing_5k"; - const char *ref_output_path = "../model_params/canny_ref_output"; - std::vector<float> psnr; - llvm_hpvm_initTensorRt(1); - size_t bstart = 0; - startMemTracking(); - while (true) { - Tensor *batch = readDataSet(input_path, bstart, batch_size); - if (batch == nullptr) - break; - - auto *result = main_procedure(batch); - auto *ref_output = readDataSet(ref_output_path, bstart, batch_size, 1); - std::vector<float> psnr_batch = PSNR(ref_output, result); - std::copy(psnr_batch.begin(), psnr_batch.end(), std::back_inserter(psnr)); - bstart += batch_size; - if (bstart >= total_max) - break; - freeBatchMemory(); - } - float violation = violationRate(psnr, psnr_threshold); - float mean_psnr = 
mean(psnr); - std::ofstream of("final_accuracy"); - of << violation * 100 << ", " << mean_psnr << '\n'; - return 0; + printf("Done with Canny \n"); + + return result; } @@ -128,84 +117,128 @@ int main(){ llvm_hpvm_initTensorRt(0); - - //std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/"); - std::string dir_prefix = std::string("../model_params/alexnet_cifar10_front/"); + std::string dir_prefix = std::string("../model_params/alexnet2_cifar10/"); + std::string input_path = dir_prefix + std::string("norm_cifar_input.bin"); + std::string labels_path = dir_prefix + std::string("test_labels.bin"); + + void* conv1_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv1.bin", + float_type, 32, 3, 3, 3); + void* conv1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv1_bias.bin", + float_type, 1, 32, 1, 1); + void* conv2_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv2.bin", + float_type, 32, 32, 3, 3); + void* conv2_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv2_bias.bin", + float_type, 1, 32, 1, 1); + void* conv3_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv3.bin", + float_type, 64, 32, 3, 3); + void* conv3_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv3_bias.bin", + float_type, 1, 64, 1, 1); + void* conv4_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv4.bin", + float_type, 64, 64, 3, 3); + void* conv4_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv4_bias.bin", + float_type, 1, 64, 1, 1); + void* conv5_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv5.bin", + float_type, 128, 64, 3, 3); + void* conv5_bias = readTrainedWeights("../model_params/alexnet2_cifar10/conv5_bias.bin", + float_type, 1, 128, 1, 1); + void* conv6_filter = readTrainedWeights("../model_params/alexnet2_cifar10/conv6.bin", + float_type, 128, 128, 3, 3); + void* conv6_bias = 
readTrainedWeights("../model_params/alexnet2_cifar10/conv6_bias.bin", + float_type, 1, 128, 1, 1); + + void* fc1_weights = readTrainedWeights("../model_params/alexnet2_cifar10/fc1.bin", + float_type, 1, 1, 2048, 10); + void* fc1_bias = readTrainedWeights("../model_params/alexnet2_cifar10/fc1_bias.bin", + float_type, 1, 10, 1, 1); - std::string input_path = dir_prefix + std::string("input.bin"); - //void* input = readTrainedWeights(input_path.c_str(), 0,10000,3,32,32); - std::string labels_path = dir_prefix + std::string("labels.bin"); - //uint8_t* labels = readLabels(labels_path.c_str(),10000); - std::string conv2d_1_w_path = dir_prefix + std::string("conv0.bin"); - void* conv2d_1_w = readTrainedWeights(conv2d_1_w_path.c_str(), 0,64,3,11,11); - std::string conv2d_1_b_path = dir_prefix + std::string("conv_bias0.bin"); - void* conv2d_1_b = readTrainedWeights(conv2d_1_b_path.c_str(), 0,1,64,1,1); - std::string conv2d_2_w_path = dir_prefix + std::string("conv3.bin"); - void* conv2d_2_w = readTrainedWeights(conv2d_2_w_path.c_str(), 0,192,64,5,5); - std::string conv2d_2_b_path = dir_prefix + std::string("conv_bias3.bin"); - void* conv2d_2_b = readTrainedWeights(conv2d_2_b_path.c_str(), 0,1,192,1,1); - std::string conv2d_3_w_path = dir_prefix + std::string("conv6.bin"); - void* conv2d_3_w = readTrainedWeights(conv2d_3_w_path.c_str(), 0,384,192,3,3); - std::string conv2d_3_b_path = dir_prefix + std::string("conv_bias6.bin"); - void* conv2d_3_b = readTrainedWeights(conv2d_3_b_path.c_str(), 0,1,384,1,1); - std::string conv2d_4_w_path = dir_prefix + std::string("conv7.bin"); - void* conv2d_4_w = readTrainedWeights(conv2d_4_w_path.c_str(), 0,256,384,3,3); - std::string conv2d_4_b_path = dir_prefix + std::string("conv_bias7.bin"); - void* conv2d_4_b = readTrainedWeights(conv2d_4_b_path.c_str(), 0,1,256,1,1); - std::string conv2d_5_w_path = dir_prefix + std::string("conv8.bin"); - void* conv2d_5_w = readTrainedWeights(conv2d_5_w_path.c_str(), 0,256,256,3,3); - std::string 
conv2d_5_b_path = dir_prefix + std::string("conv_bias8.bin"); - void* conv2d_5_b = readTrainedWeights(conv2d_5_b_path.c_str(), 0,1,256,1,1); - std::string dense_1_w_path = dir_prefix + std::string("fc12.bin"); - void* dense_1_w = readTrainedWeights(dense_1_w_path.c_str(), 0,1,1,4096,10); - std::string dense_1_b_path = dir_prefix + std::string("fc_bias12.bin"); - void* dense_1_b = readTrainedWeights(dense_1_b_path.c_str(), 0,1,10,1,1); - - - startMemTracking(); - int test_input_size = 2000; - int batch_size = 2000; + int test_input_size = 5000; + int batch_size = 500; int batch_count = test_input_size / batch_size; float final_accuracy = 0.0; // NOTE: Starting time profiling - startProfiling(); - + startProfiling(); + startMemTracking(); + + int conv_mode = 1; // NOTE: using CROSS_CORRELATION + int conv_precision = 0; // NOTE: using Float as compute precision. FIXIT: use enum + for(int i = 0; i < batch_count; i++){ int start = i * batch_size; int end = (i + 1) * batch_size; - void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); - - void* var_0 = tensorConvolution(input, conv2d_1_w, 5, 5, 1, 1, 1, 0); - void* var_1 = tensorAdd(var_0, conv2d_1_b); - void* var_2 = tensorTanh(var_1); - void* var_3 = tensorPooling(var_2,0,2,2,0,0,2,2); - void* var_5 = tensorConvolution(var_3, conv2d_2_w, 2, 2, 1, 1, 1, 0); - void* var_6 = tensorAdd(var_5, conv2d_2_b); - void* var_7 = tensorTanh(var_6); - void* var_8 = tensorPooling(var_7,0,2,2,0,0,2,2); - void* var_10 = tensorConvolution(var_8, conv2d_3_w, 1, 1, 1, 1, 1, 0); - void* var_11 = tensorAdd(var_10, conv2d_3_b); - void* var_12 = tensorTanh(var_11); - void* var_13 = tensorConvolution(var_12, conv2d_4_w, 1, 1, 1, 1, 1, 0); - void* var_14 = tensorAdd(var_13, conv2d_4_b); - void* var_15 = tensorTanh(var_14); - void* var_16 = tensorConvolution(var_15, conv2d_5_w, 1, 1, 1, 1, 1, 0); - void* var_17 = tensorAdd(var_16, conv2d_5_b); - void* var_18 = tensorTanh(var_17); - void* var_19 = 
tensorPooling(var_18,0,2,2,0,0,2,2); - void* var_22 = tensorGemmGPU(var_19, dense_1_w); - void* var_23 = tensorAdd(var_22, dense_1_b); - void* var_24 = tensorSoftmax(var_23); + + + void* input = readInputBatch(input_path.c_str(), 0,start,end,3,32,32); + + void* conv1out = tensorConvolution(input, conv1_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv1out, conv1_bias); + void* conv1_tanh = tensorTanh(conv1out); + + // 2nd Layer + void* conv2out = tensorConvolution(conv1_tanh, conv2_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv2out, conv2_bias); + void* conv2_tanh = tensorTanh(conv2out); + void* pool2out = tensorPooling(conv2_tanh, 0, 2, 2, 0, 0, 2, 2); + + // 3rd Layer + void* conv3out = tensorConvolution(pool2out, conv3_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv3out, conv3_bias); + void* conv3_tanh = tensorTanh(conv3out); + + // 4th Layer + void* conv4out = tensorConvolution(conv3_tanh, conv4_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv4out, conv4_bias); + void* conv4_tanh = tensorTanh(conv4out); + void* pool4out = tensorPooling(conv4_tanh, 0, 2, 2, 0, 0, 2, 2); + + // 5th Layer + void* conv5out = tensorConvolution(pool4out, conv5_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv5out, conv5_bias); + void* conv5_tanh = tensorTanh(conv5out); + + // 6th Layer + void* conv6out = tensorConvolution(conv5_tanh, conv6_filter, 1, 1, 1, 1, + conv_mode, conv_precision); + tensorAdd(conv6out, conv6_bias); + + void* conv6_tanh = tensorTanh(conv6out); + void* pool6out = tensorPooling(conv6_tanh, 0, 2, 2, 0, 0, 2, 2); + + // final FC Layer + void* gemm1out = tensorGemmGPU(pool6out, fc1_weights); + void* gemm1biasout = tensorAdd(gemm1out, fc1_bias); + void* result = tensorSoftmax(gemm1biasout); uint8_t* labels = readLabelsBatch(labels_path.c_str(), start, end); - float accuracy = computeAccuracy2(labels,batch_size,var_24); + float accuracy = computeAccuracy2(labels, batch_size, 
result); final_accuracy += accuracy; + + + std::vector<int> index_vector; + index_vector.push_back(1); + index_vector.push_back(2); + index_vector.push_back(3); + index_vector.push_back(4); + index_vector.push_back(5); - freeBatchMemory(); + + void* argmax_out = tensorArgMax(result); + void* select_out = tensorSelect2(argmax_out, index_vector); + void* reduced_input = tensorContract(input, select_out); + + + invoke_canny(reduced_input); + + + freeBatchMemory(); } stopProfiling(); diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h index fe8927f289deecb3a00b39bcc86377d122f7ef2a..b9128c1a24ca5bd95a7e6fb9e962d56501558f8f 100644 --- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h +++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/tensor_custom_ops_cpu.h @@ -2,6 +2,7 @@ #include "tensor.h" #include <stdlib.h> +#include <vector> void* tensorArgMax(void* input_ptr){ @@ -76,6 +77,47 @@ void* tensorSelect(void* input_ptr, float target_value){ + +void* tensorSelect2(void* input_ptr, std::vector<int> index_vector){ + + Tensor* input = (Tensor*) input_ptr; + float* host_ptr = (float*) input->host_data; + + int batch_size = input->dims.dim_sizes[0]; + int channels = input->dims.dim_sizes[1]; + + if (channels != 1){ + printf("* Channels dimension must be 1 \n"); + abort(); + } + + Tensor* output = (Tensor *) create4DTensor(0, 0, batch_size, 1, 1, 1); + changeTensorPlacement(output, HOST); + float* out_ptr = (float*) output->host_data; + + for(int i = 0; i < batch_size; i++){ + + for(int j = 0; j < index_vector.size(); j++){ + int target_value = index_vector[j]; + if (host_ptr[i] == target_value){ + out_ptr[i] = 1; + break; + } + else{ + out_ptr[i] = 0; + } + } + + } + + return output; +} + + + + + + long getOnesInVector(float* vector_host_ptr, long vector_length){ long ones_count = 0;