<link crossorigin="anonymous" href="https://assets-cdn.github.com/assets/frameworks-cfd028037d00b8a80c5a3b91967e29dcc41309e44d4ccece88663a52963ca765.css" integrity="sha256-z9AoA30AuKgMWjuRln4p3MQTCeRNTM7OiGY6UpY8p2U=" media="all" rel="stylesheet" />
<link crossorigin="anonymous" href="https://assets-cdn.github.com/assets/github-dc8832cbc25013ebcdcca24529e402cfcea50e4b2bd136b8284fdf8864873636.css" integrity="sha256-3Igyy8JQE+vNzKJFKeQCz86lDksr0Ta4KE/fiGSHNjY=" media="all" rel="stylesheet" />
<link crossorigin="anonymous" href="https://assets-cdn.github.com/assets/site-9e0f35305336555b58884b07a160747fc1f6dbd79e13e18820a598a9abcb2662.css" integrity="sha256-ng81MFM2VVtYiEsHoWB0f8H229eeE+GIIKWYqavLJmI=" media="all" rel="stylesheet" />
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta http-equiv="Content-Language" content="en">
<meta name="viewport" content="width=device-width">
<title>ece408project/README.md at master · webgpu/ece408project · GitHub</title>
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="GitHub">
<link rel="fluid-icon" href="https://github.com/fluidicon.png" title="GitHub">
<link rel="apple-touch-icon" href="/apple-touch-icon.png">
<link rel="apple-touch-icon" sizes="57x57" href="/apple-touch-icon-57x57.png">
<link rel="apple-touch-icon" sizes="60x60" href="/apple-touch-icon-60x60.png">
<link rel="apple-touch-icon" sizes="72x72" href="/apple-touch-icon-72x72.png">
<link rel="apple-touch-icon" sizes="76x76" href="/apple-touch-icon-76x76.png">
<link rel="apple-touch-icon" sizes="114x114" href="/apple-touch-icon-114x114.png">
<link rel="apple-touch-icon" sizes="120x120" href="/apple-touch-icon-120x120.png">
<link rel="apple-touch-icon" sizes="144x144" href="/apple-touch-icon-144x144.png">
<link rel="apple-touch-icon" sizes="152x152" href="/apple-touch-icon-152x152.png">
<link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon-180x180.png">
<meta property="fb:app_id" content="1401488693436528">
<meta content="https://avatars0.githubusercontent.com/u/17249912?v=3&s=400" name="twitter:image:src" /><meta content="@github" name="twitter:site" /><meta content="summary" name="twitter:card" /><meta content="webgpu/ece408project" name="twitter:title" /><meta content="ece408project - The goal of this project is to accelerate the forward propagation step of the Convolutional Neural Network (CNN) algorithm using GPUs. " name="twitter:description" />
<meta content="https://avatars0.githubusercontent.com/u/17249912?v=3&s=400" property="og:image" /><meta content="GitHub" property="og:site_name" /><meta content="object" property="og:type" /><meta content="webgpu/ece408project" property="og:title" /><meta content="https://github.com/webgpu/ece408project" property="og:url" /><meta content="ece408project - The goal of this project is to accelerate the forward propagation step of the Convolutional Neural Network (CNN) algorithm using GPUs. " property="og:description" />
<meta name="browser-stats-url" content="https://api.github.com/_private/browser/stats">
<meta name="browser-errors-url" content="https://api.github.com/_private/browser/errors">
<link rel="assets" href="https://assets-cdn.github.com/">
<meta name="pjax-timeout" content="1000">
<meta name="request-id" content="62D490BD:557B:4925FB3:5855D342" data-pjax-transient>
<meta name="msapplication-TileImage" content="/windows-tile.png">
<meta name="msapplication-TileColor" content="#ffffff">
<meta name="selected-link" value="repo_source" data-pjax-transient>
<meta name="google-site-verification" content="KT5gs8h0wvaagLKAVWq8bbeNwnZZK1r1XQysX3xurLU">
<meta name="hostname" content="github.com">
<meta name="user-login" content="">
<meta name="expected-hostname" content="github.com">
<meta name="js-proxy-site-detection-payload" content="MmUwZjBlZDA0NzhmYjNlYzdlYmI3NDJjZmJjMzNjNTlmY2FmMDE2YjQ4MjBjMDk5Njk4OGIxNzk0M2ZjZDBiZHx7InJlbW90ZV9hZGRyZXNzIjoiOTguMjEyLjE0NC4xODkiLCJyZXF1ZXN0X2lkIjoiNjJENDkwQkQ6NTU3Qjo0OTI1RkIzOjU4NTVEMzQyIiwidGltZXN0YW1wIjoxNDgyMDE5NjUwLCJob3N0IjoiZ2l0aHViLmNvbSJ9">
<link rel="mask-icon" href="https://assets-cdn.github.com/pinned-octocat.svg" color="#000000">
<link rel="icon" type="image/x-icon" href="https://assets-cdn.github.com/favicon.ico">
<meta name="html-safe-nonce" content="f0cb0015334d1aa9d75b5545ac87cf97b04b06f0">
<meta content="bc4c4295c56de55124876b7fd5ddcbeaddca7d1f" name="form-nonce" />
<meta http-equiv="x-pjax-version" content="8cdbb8cffd6956425aaad8ad14e0496b">
<link rel="canonical" href="https://github.com/webgpu/ece408project/blob/master/README.md" data-pjax-transient>
<header class="site-header js-details-container" role="banner">
<button class="btn-link float-right site-header-toggle js-details-target" type="button" aria-label="Toggle navigation">
<svg aria-hidden="true" class="octicon octicon-three-bars" height="24" version="1.1" viewBox="0 0 12 16" width="18"><path fill-rule="evenodd" d="M11.41 9H.59C0 9 0 8.59 0 8c0-.59 0-1 .59-1H11.4c.59 0 .59.41.59 1 0 .59 0 1-.59 1h.01zm0-4H.59C0 5 0 4.59 0 4c0-.59 0-1 .59-1H11.4c.59 0 .59.41.59 1 0 .59 0 1-.59 1h.01zM.59 11H11.4c.59 0 .59.41.59 1 0 .59 0 1-.59 1H.59C0 13 0 12.59 0 12c0-.59 0-1 .59-1z"/></svg>
</button>
<div class="site-header-menu">
<nav class="site-header-nav site-header-nav-main">
<a href="/personal" class="js-selected-navigation-item nav-item nav-item-personal" data-ga-click="Header, click, Nav menu - item:personal" data-selected-links="/personal /personal">
Personal
<div class="site-header-actions">
<a class="btn btn-primary site-header-actions-btn" href="/join?source=header-repo" data-ga-click="(Logged out) Header, clicked Sign up, text:sign-up">Sign up</a>
<a class="btn site-header-actions-btn mr-1" href="/login?return_to=%2Fwebgpu%2Fece408project%2Fblob%2Fmaster%2FREADME.md" data-ga-click="(Logged out) Header, clicked Sign in, text:sign-in">Sign in</a>
</div>
<nav class="site-header-nav site-header-nav-secondary mr-md-3">
<a class="nav-item" href="/pricing">Pricing</a>
<a class="nav-item" href="/blog">Blog</a>
<a class="nav-item" href="https://help.github.com">Support</a>
<a class="nav-item header-search-link" href="https://github.com/search">Search GitHub</a>
<div class="header-search scoped-search site-scoped-search js-site-search" role="search">
<label class="form-control header-search-wrapper js-chromeless-input-container">
<div class="header-search-scope">This repository</div>
<input type="text"
class="form-control header-search-input js-site-search-focus js-site-search-field is-clearable"
data-hotkey="s"
name="q"
placeholder="Search"
aria-label="Search this repository"
data-unscoped-placeholder="Search GitHub"
data-scoped-placeholder="Search"
autocapitalize="off">
</label>
</nav>
</div>
<div id="start-of-content" class="accessibility-aid"></div>
<div id="js-flash-container">
<div role="main">
<div itemscope itemtype="http://schema.org/SoftwareSourceCode">
<div id="js-repo-pjax-container" data-pjax-container>
- Watch 6
-
Star
<a class="social-count js-social-count" href="/webgpu/ece408project/stargazers" aria-label="2 users starred this repository"> 2 </a>
-
Fork
<a href="/webgpu/ece408project/network" class="social-count" aria-label="11 users forked this repository"> 11 </a>
<h1 class="public ">
<span itemscope itemtype="http://schema.org/ListItem" itemprop="itemListElement">
<a href="/webgpu/ece408project/issues" class="js-selected-navigation-item reponav-item" data-hotkey="g i" data-selected-links="repo_issues repo_labels repo_milestones /webgpu/ece408project/issues" itemprop="url">
<svg aria-hidden="true" class="octicon octicon-issue-opened" height="16" version="1.1" viewBox="0 0 14 16" width="14"><path fill-rule="evenodd" d="M7 2.3c3.14 0 5.7 2.56 5.7 5.7s-2.56 5.7-5.7 5.7A5.71 5.71 0 0 1 1.3 8c0-3.14 2.56-5.7 5.7-5.7zM7 1C3.14 1 0 4.14 0 8s3.14 7 7 7 7-3.14 7-7-3.14-7-7-7zm1 3H6v5h2V4zm0 6H6v2h2v-2z"/></svg>
<span itemprop="name">Issues</span>
<span class="counter">0</span>
<meta itemprop="position" content="2">
Pull requests 0 Projects 0 Pulse Graphs
type="button" aria-label="Switch branches or tags" tabindex="0" aria-haspopup="true">
<i>Branch:</i>
<span class="js-select-button css-truncate-target">master</span>
<div class="commit-tease-contributors">
<button type="button" class="btn-link muted-link contributors-toggle" data-facebox="#blob_contributors_box">
<strong>3</strong>
contributors
</button>
<a class="avatar-link tooltipped tooltipped-s" aria-label="abduld" href="/webgpu/ece408project/commits/master/README.md?author=abduld"><img alt="@abduld" class="avatar" height="20" src="https://avatars3.githubusercontent.com/u/1404191?v=3&s=40" width="20" /> </a>
<a class="avatar-link tooltipped tooltipped-s" aria-label="cli99" href="/webgpu/ece408project/commits/master/README.md?author=cli99"><img alt="@cli99" class="avatar" height="20" src="https://avatars0.githubusercontent.com/u/17418037?v=3&s=40" width="20" /> </a>
<a class="avatar-link tooltipped tooltipped-s" aria-label="cwpearson" href="/webgpu/ece408project/commits/master/README.md?author=cwpearson"><img alt="@cwpearson" class="avatar" height="20" src="https://avatars2.githubusercontent.com/u/6765756?v=3&s=40" width="20" /> </a>
</div>
<div id="blob_contributors_box" style="display:none">
<h2 class="facebox-header" data-facebox-id="facebox-header">Users who have contributed to this file</h2>
<ul class="facebox-user-list" data-facebox-id="facebox-description">
<li class="facebox-user-list-item">
<img alt="@abduld" height="24" src="https://avatars1.githubusercontent.com/u/1404191?v=3&s=48" width="24" />
<a href="/abduld">abduld</a>
</li>
<li class="facebox-user-list-item">
<img alt="@cli99" height="24" src="https://avatars2.githubusercontent.com/u/17418037?v=3&s=48" width="24" />
<a href="/cli99">cli99</a>
</li>
<li class="facebox-user-list-item">
<img alt="@cwpearson" height="24" src="https://avatars0.githubusercontent.com/u/6765756?v=3&s=48" width="24" />
<a href="/cwpearson">cwpearson</a>
</li>
</ul>
</div>
<div class="BtnGroup">
<a href="/webgpu/ece408project/raw/master/README.md" class="btn btn-sm BtnGroup-item" id="raw-url">Raw</a>
<a href="/webgpu/ece408project/blame/master/README.md" class="btn btn-sm js-update-url-with-hash BtnGroup-item">Blame</a>
<a href="/webgpu/ece408project/commits/master/README.md" class="btn btn-sm BtnGroup-item" rel="nofollow">History</a>
</div>
<a class="btn-octicon tooltipped tooltipped-nw"
href="https://windows.github.com"
aria-label="Open this file in GitHub Desktop"
data-ga-click="Repository, open with desktop, type:windows">
<svg aria-hidden="true" class="octicon octicon-device-desktop" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M15 2H1c-.55 0-1 .45-1 1v9c0 .55.45 1 1 1h5.34c-.25.61-.86 1.39-2.34 2h8c-1.48-.61-2.09-1.39-2.34-2H15c.55 0 1-.45 1-1V3c0-.55-.45-1-1-1zm0 9H1V3h14v8z"/></svg>
</a>
<button type="button" class="btn-octicon disabled tooltipped tooltipped-nw"
aria-label="You must be signed in to make or propose changes">
<svg aria-hidden="true" class="octicon octicon-pencil" height="16" version="1.1" viewBox="0 0 14 16" width="14"><path fill-rule="evenodd" d="M0 12v3h3l8-8-3-3-8 8zm3 2H1v-2h1v1h1v1zm10.3-9.3L12 6 9 3l1.3-1.3a.996.996 0 0 1 1.41 0l1.59 1.59c.39.39.39 1.02 0 1.41z"/></svg>
</button>
<button type="button" class="btn-octicon btn-octicon-danger disabled tooltipped tooltipped-nw"
aria-label="You must be signed in to make or propose changes">
<svg aria-hidden="true" class="octicon octicon-trashcan" height="16" version="1.1" viewBox="0 0 12 16" width="12"><path fill-rule="evenodd" d="M11 2H9c0-.55-.45-1-1-1H5c-.55 0-1 .45-1 1H2c-.55 0-1 .45-1 1v1c0 .55.45 1 1 1v9c0 .55.45 1 1 1h7c.55 0 1-.45 1-1V5c.55 0 1-.45 1-1V3c0-.55-.45-1-1-1zm-1 12H3V5h1v8h1V5h1v8h1V5h1v8h1V5h1v9zm1-10H2V3h9v1z"/></svg>
</button>
ECE 408 Project
The goal of this project is to accelerate the forward propagation step of the Convolutional Neural Network (CNN) algorithm using GPUs. The sequential implementation provided follows the basic algorithms 16.4 and 16.5 described in book chapter 16. The dataset and model are from the MNIST database.
CNN and MNIST
Read the book chapter and familiarize yourself with the CNN algorithm.
Provided is a model that has been trained using 60,000 examples (training set images) and the provided test data is 10,000 batched queries (test set images). The expected accuracy of the CNN is ~87%
on the provided test dataset.
The data and model are in HDF5 format and we have provided the code to read the input model and the training dataset.
CUDA Implementation
Book chapters 16.3 and 16.4 provide a basic CUDA implementation of forward propagation of convolutional layer and possible optimization. Your CUDA implementation would be evaluated based on performance and accuracy. Apply any optimization you think would bring benefit and feel free to modify any part of the code. You should not use cuBLAS
or cuDNN
for the implementation, but you are expected to compare your implementation with those libraries --- profiling the code as well as comparing the algorithms used (if algorithm information is publicly available).
Remote Development Environment
The easiest way to develop the project is to use rai through the following prebuilt binaries. The stable version only supports Linux and OSX. For students with Windows, you can try the beta version or use the Linux on EWS for RAI.
NOTE: Even if you use your local development environment, your final code must run within the RAI system. Also, your final report performance measurements must be done within RAI.
Download Binaries
The code is continuously built and published. The client can be downloaded from the following URLs (depending on your OS and Architecture):
Operating System | Architecture | Stable Version Link | Development Version Link |
---|---|---|---|
Linux | i386 | URL | URL |
Linux | amd64 | URL | URL |
Linux | armv5 | URL | URL |
Linux | armv6 | URL | URL |
Linux | armv7 | URL | URL |
Linux | arm64 | URL | URL |
OSX/Darwin | i386 | URL | URL |
OSX/Darwin | amd64 | URL | URL |
Windows | i386 | URL | URL |
Windows | amd64 | URL | URL |
Client
Set up your Secret Key
Each team will be contacted by a TA and given a secret key to use this service. Do not share your key with other teams. The secret key is used to authenticate you with the server.
The RAI_SECRET_KEY
, RAI_TEAM_NAME
, and RAI_ACCESS_KEY
should be specified in your ~/.rai.profile
(linux/OSX) or %HOME%/.rai.profile
(Windows -- for me this is C:\Users\abduld\.rai.profile
) in the following way.
RAI_TEAM_NAME='Your Team Name Here' RAI_USER_NAME='user' RAI_ACCESS_KEY='XXXXXXXX' RAI_SECRET_KEY='XXXXX'
The above will need to match the email you received from postmaster@webgpu.com
on Nov 23. If you did not receive the email, then contact the TA. Also, contact the TA with your team name as soon as possible. Do not share your keys with other users or teams. The access and secret keys are used to authenticate you with the server. Both the team name and the username are used to identify you to the system.
Run the Client
To run the client, use
rai -d <project folder>
From a user's point of view, when the client runs, the local directory specified by -d
gets uploaded to the server and extracted into the /src
directory on the server. The server then executes the build commands from the rai-build.yml
specification within the /build
directory. Once the commands have been run, or there is an error, a zipped version of that /build
directory is available from the server for download.
The server limits the task time to be an hour with a maximum of 8GB of memory being used within a session. The output /build
directory is only available to be downloaded from the server for a short amount of time. Networking is also disabled on the execution server.
Internal Details (Ignore if not Interested)
The client sends job submission requests to the rai server. The internal steps the client takes are as follows:
- The client creates an archive of your directory and posts it to Amazon S3.
- The client creates a unique identifier (here called ID). These IDs are generated using NewObjectId.
- The client creates a job request and publishes it to the tasks topic on the queue. The job request has the ID field with the value ID and is marshaled using the bson library. The reason for using bson is that we will want to store the results in MongoDB in the future.
- The client subscribes to the topic log-ID and prints the results on that topic.
- The client stops listening when the message on the topic has a tag TagEnd.
Project Build Specification
The rai-build.yml
must exist in your project directory. If not available, then the system will use the default build script. In some cases you may not be able to execute certain commands; in this scenario the current workaround is to create a bash file and insert the commands you need to run. You can then execute the bash script within rai-build.yml
.
The rai-build.yml
is written as a Yaml (Spec) file and has the following structure.
rai: version: 0.1 # this is required image: webgpu/rai:root # this is ignored at this moment with the webgpu/rai:root # image being used by default. webgpu/rai:root is a docker # image which can be viewed at https://hub.docker.com/r/webgpu/rai/ resources: gpus: 1 # currently this field is ignored, but in the future you'd be able to specify your # system requirements commands: build: - echo "Building project" # Since the system already contains the dependencies (like HDF5 and ZLib) we do not # need the hunter package manager. This speeds up the compilation as well - cmake -DCONFIG_USE_HUNTER=OFF /src # Run the make file to compile the project. - make # here we break the long command into multiple lines. The Yaml # format supports this using a block-strip command. See # http://stackoverflow.com/a/21699210/3543720 for info - >- nvprof --analysis-metrics --print-api-trace --cpu-profiling on --demangling on --export-profile profile.nvvp --force-overwrite --log-file run.log --print-gpu-trace -- ./ece408 /src/data/test10.hdf5 /src/data/model.hdf5 10
Syntax errors will be reported and the job will not be executed. You can check if your file is in a valid yaml format by using tools such as Yaml Validator.
Profiling
Profiling can be performed using nvprof
. Place the following build commands in your rai-build.yml
file
- >- nvprof --cpu-profiling on --export-profile timeline.nvprof -- ./ece408 /src/data/test10.hdf5 /src/data/model.hdf5 10 - >- nvprof --cpu-profiling on --export-profile analysis.nvprof --analysis-metrics -- ./ece408 /src/data/test10.hdf5 /src/data/model.hdf5 10
You could change the input and test datasets. This will output two files timeline.nvprof
and analysis.nvprof
which can be viewed using the nvvp
tool (by performing a file>import
).
NOTE: nvvp
will only show performance metrics for GPU invocations, so it may not show any analysis when you only have serial code.
Project Submission
You will use the same client (with certain options) for the final submission. The submission system will notify the teaching assistants and record your ranking. You will need the above credentials to make your final submission.
To submit your project, run
rai submit -d <project folder>
To perform the final project submission, you must have the USAGE
, README
, and report.pdf
files in your project folder (as stated in the "What to Deliver" section). The submission system ignores your rai-build.yml
file and instead runs the following build file:
rai: version: 0.1 resources: gpus: 1 commands: build: - echo "Submitting project" - cp -r /src /build/submission_code - cmake -DCONFIG_USE_HUNTER=OFF /src - make - /usr/bin/time ./ece408 /src/data/testfull.hdf5 /src/data/model.hdf5 10000
NOTE: Only your last submission is recorded, so please make sure that your last submission is the one you'd want to be graded.
Competition Rankings
You can see the current rankings for the project competition by invoking
rai rankings
You can see only the top 10 teams by invoking
rai rankings -l 10
Reporting Issues
If emailing the TA with a problem, then please include the output of
rai version
as well as the output of
rai buildtime
you can also invoke the rai command with verbose and debug outputs using
rai --verbose --debug
Local Development Environment
NOTE: Even if you use your local development environment, your final code must run within the RAI system. Also, your final report performance measurements must be done within RAI.
The project requires a CUDA-supported operating system, C compiler, and the CUDA 8 Toolkit. The CUDA 8 Toolkit can be downloaded from the CUDA Download page. Instructions on how to install the CUDA Toolkit are available in the Quick Start page. Installation guides and the list of supported C compilers for Windows, Linux, and OSX are also found in the CUDA Toolkit Documentation Page.
Aside from a C compiler and the CUDA 8 Toolkit, CMake 3.1 or later is required to generate build scripts for your target IDE and compiler. On windows, we require Visual Studio 2015 (Service Pack 3) which you can download from the webstore. For other systems, a CUDA compatible compiler is required (e.g. for OSX the clang compiler is the only one supported).
How to Build
There are two options to build this project, the first is using the Hunter package manager and the other is using Docker. We suggest using CMake along with Hunter, but it is known not to work on all operating systems. In this case, we suggest that you either use Docker or install the libraries needed (mainly HDF5
).
Using Hunter Package Manager
By default, the compilation uses the Hunter --- a C package manager. This method requires that you have the CUDA toolkit installed on your machine.
Assuming that you have checked out the project into $SRCDIR
do
cd $SRCDIR mkdir build cd build cmake $SRCDIR
This will download the required software needed for the project (see the hunter docs for more information). You may see some warning while the system is compiling HDF5, which you can ignore. Once CMake has been run, a Makefile
is generated so you can then perform make
to build the project.
make
If you do not plan on using make
, examine the cmake -G
option which allows you to generate XCode, Visual Studio, ... project configurations. You may also need to change the build type to enable/disable debugging and/or optimizations.
If you need to use another library, you need to modify the CMakeLists.txt
and add the libraries to the target_link_libraries
(and possibly the include_directories
) section. Documentation on the CMake commands is found in the documentation page.
Using Docker Container
Also included is a Docker build file. This file is a specification for a Docker container image. It can be used to build and launch a container (think of a virtual machine) which contains this project along with all the software required to run it. Using a GPU within Docker is only supported on Linux(you can compile and run the serial code on any operating system), and we recommend using NVIDIA-Docker to run the Docker image. To build the Docker container, do
cd $SRCDIR docker build . -t ece408project
Once built, the ece408project
image would be listed by the docker images
command. This will compile your project. You can launch the docker image using
docker run -it ece408project
Running the Serial Code
./ece408 ../data/test10.hdf5 ../data/model.hdf5 batch_size
the batch_size
must match the size of the dataset. If batch_size
is unspecified, the default value is dependent on the input (10 for "../data/test10.hdf5", ..., 10000 for "../data/testfull.hdf5"), which is also the size of data.hdf5
.
How to Test
Test your implementation with a small batch size first to verify the correctness. You can parse the data/test100.hdf5
into smaller chunks using your preferred language (e.g. Python). 2, 10 and 100 queries are provided in data/test2.hdf5
, data/test10.hdf5
and data/test100.hdf5
in the data folder. Make sure the data file you feed in has the same batch size as the batch_size
you specify in the command line.
./ece408 ../data/test10.hdf5 ../data/model.hdf5 10
What to Deliver
A .tar.gz
file which contains the report, code directory, the build scripts, and, possibly, the input dataset needs to be delivered to the Teaching Assistants.
- Code: A USAGE file needs to be included in the archive file, with instructions on how to compile and run your code. If the report performs any profiling, the USAGE file must also specify how to run the performance measurements.
- Report: A PDF version of the report must be included within the .tar.gz file. The report should describe and evaluate the optimizations you tried. The report does not have a page limit, but as usual, you should strive to be thorough, concise, and quantitative in your performance analysis. The report must be named report.pdf.
Make sure you have a working CUDA implementation before applying any optimizations.
Optimization Opportunities
The serial version of the code is amenable to many optimization opportunities; the following is an incomplete set of them:
- Optimize the CUDA memory copies to decrease the overhead of memory transfers
- Overlapping the memory transfer and the compute and/or independent computations using CUDA streams
- Performing layout transformations to get coalesced accesses or to make better use of the cache
- Using low precision to perform the computation (for example using float16 or binary values)
- Based on the size of the convolution, utilize better algorithms to perform the computation (for example using the [Winograd Kernel](https://www.nervanasys.com/winograd-2/))
Utility Functions
We provide some helper utility functions in the utils.hpp
file.
How to Time
In utils.hpp
there is a function called now()
which allows you to get the current time at a high resolution. To measure the overhead of a function f(args...)
, the pattern to use is:
const auto tic = now(); f(args...); const auto toc = now(); const auto elapsed = std::chrono::duration<double, std::milli>(toc - tic).count();; std::cout << "Calling f(args...) took " << elapsed << "milliseconds\n";
Range For Loops
Throughout the serial code, we use the range.hpp
to make the code easier to understand. Essentially,
for (const auto ii : range(0, N)) { do_stuff(ii); }
Is equivalent to
for (auto ii = 0; ii < N; ii++) { do_stuff(ii); }
Checking Errors
To check for CUDA errors, specialize the check_success
function in utils.hpp
to also handle cudaError_t
. For example:
template <> bool check_success<cudaError_t>(const cudaError_t &err) { const auto res = err == cudaSuccess; if (res == true) { return res; } std::cout << "Failed in CUDA. Error = " << cudaGetErrorString(err) << std::endl; assert(res); return res; }
check_success
can then be used when calling CUDA functions:
check_success(cudaFree(deviceData));
Reporting Issues
Please use the GitHub issue manager to report any issues or suggestions about the project.
Jump to Line
</div>
</div>
<div class="container site-footer-container">
</ul>
<a href="https://github.com" aria-label="Homepage" class="site-footer-mark" title="GitHub">
<svg aria-hidden="true" class="octicon octicon-mark-github" height="24" version="1.1" viewBox="0 0 16 16" width="24"><path fill-rule="evenodd" d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.013 8.013 0 0 0 16 8c0-4.42-3.58-8-8-8z"/></svg>
<div id="ajax-error-message" class="ajax-error-message flash flash-error">
<svg aria-hidden="true" class="octicon octicon-alert" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M8.865 1.52c-.18-.31-.51-.5-.87-.5s-.69.19-.87.5L.275 13.5c-.18.31-.18.69 0 1 .19.31.52.5.87.5h13.7c.36 0 .69-.19.86-.5.17-.31.18-.69.01-1L8.865 1.52zM8.995 13h-2v-2h2v2zm0-3h-2V6h2v4z"/></svg>
<button type="button" class="flash-close js-flash-close js-ajax-error-dismiss" aria-label="Dismiss error">
<svg aria-hidden="true" class="octicon octicon-x" height="16" version="1.1" viewBox="0 0 12 16" width="12"><path fill-rule="evenodd" d="M7.48 8l3.75 3.75-1.48 1.48L6 9.48l-3.75 3.75-1.48-1.48L4.52 8 .77 4.25l1.48-1.48L6 6.52l3.75-3.75 1.48 1.48z"/></svg>
</button>
You can't perform that action at this time.
</div>
<script crossorigin="anonymous" integrity="sha256-BciiVpjF80z81NbTK5C+pYJJy0AR3716p9iUKqiUi6A=" src="https://assets-cdn.github.com/assets/frameworks-05c8a25698c5f34cfcd4d6d32b90bea58249cb4011dfbd7aa7d8942aa8948ba0.js"></script>
<script async="async" crossorigin="anonymous" integrity="sha256-ui8c6b8Kx6xyTL16Zl4BliNXtzMyLWgscQqAIibkRaE=" src="https://assets-cdn.github.com/assets/github-ba2f1ce9bf0ac7ac724cbd7a665e01962357b733322d682c710a802226e445a1.js"></script>
<div class="js-stale-session-flash stale-session-flash flash flash-warn flash-banner d-none">
<svg aria-hidden="true" class="octicon octicon-alert" height="16" version="1.1" viewBox="0 0 16 16" width="16"><path fill-rule="evenodd" d="M8.865 1.52c-.18-.31-.51-.5-.87-.5s-.69.19-.87.5L.275 13.5c-.18.31-.18.69 0 1 .19.31.52.5.87.5h13.7c.36 0 .69-.19.86-.5.17-.31.18-.69.01-1L8.865 1.52zM8.995 13h-2v-2h2v2zm0-3h-2V6h2v4z"/></svg>
<span class="signed-in-tab-flash">You signed in with another tab or window. <a href="">Reload</a> to refresh your session.</span>
<span class="signed-out-tab-flash">You signed out in another tab or window. <a href="">Reload</a> to refresh your session.</span>
</div>
<div class="facebox" id="facebox" style="display:none;">