diff --git a/README.md b/README.md
index a947d3f83edc5862edf6eb5b045b63aa3817d153..27ad0e045b85bf32de78dc8a93f03456bb4d77de 100644
--- a/README.md
+++ b/README.md
@@ -27,8 +27,8 @@ The following components are required to be installed on your machine to build H
 
 Checkout HPVM:
 ```shell
-git clone https://gitlab.engr.illinois.edu/llvm/hpvm.git
-git checkout hpvm-release
+git clone https://gitlab.engr.illinois.edu/llvm/hpvm-release.git
+cd hpvm-release
 ```
 
 HPVM installer script can be used to download, configure and build HPVM along with LLVM and other subprojects including Clang. 
@@ -66,10 +66,10 @@ make install
 With all the aforementioned steps, HPVM should be built, installed and ready for use!
 
 ## Benchmarks and Tests
-We are providing the following benchmarks with HPVM:
-* Select benchmarks from the Parboil benchmark suite, located under [test/parboil](/hpvm/test/parboil).
+We are providing the following HPVM benchmarks:
+* Select benchmarks from the [Parboil](http://impact.crhc.illinois.edu/parboil/parboil.aspx) benchmark suite, located under [test/parboil](/hpvm/test/parboil).
 * An edge detection pipeline benchmark, located under [test/pipeline](/hpvm/test/pipeline).
-* A Camera ISP pipeline, curtosy of our collaborators at [Harvard](http://vlsiarch.eecs.harvard.edu), located under [test/cava](/hpvm/test/pipeline).
+* A Camera ISP pipeline, located under [test/cava](/hpvm/test/cava), adapted from C code provided by our collaborators at [Harvard](http://vlsiarch.eecs.harvard.edu).
 
-We are also providing [unit tests](/hpvm/test/unitTests) and [regression tests](/hpm/test/regressionTests).
+We are also providing [unit tests](/hpvm/test/unitTests) and [regression tests](/hpvm/test/regressionTests).
 
diff --git a/hpvm/docs/hpvm-c.md b/hpvm/docs/hpvm-c.md
index 2d377016e4a7e463b27cf1dfa9573d2622ac856d..77bc684b16eb6462d7d61cffbc50f258b454b1f6 100644
--- a/hpvm/docs/hpvm-c.md
+++ b/hpvm/docs/hpvm-c.md
@@ -8,9 +8,6 @@ Used before all other HPVM calls to initialize the HPVM runtime.
 ```void __hpvm__cleanup()```  
 Used at the end of HPVM program to clean up all remaining runtime-created HPVM objects.
 
-```void __hpvm__cleanup()```  
-Used at the end of HPVM program to clean up all remaining runtime-created HPVM objects.
-
 ```void llvm_hpvm_track_mem(void* ptr, size_t sz)```  
 Insert memory starting at ```ptr``` of size ```sz``` in the memory tracker of HPVM runtime.
 
@@ -30,7 +27,7 @@ Waits for completion of execution of the dataflow graph with handle ```G```.
 Push set of input data items, ```args```, (same as type included in launch) to streaming DFG with handle ```G```.
 
 ```void* __hpvm__pop(void* G)```  
-Pop and return data produced from one execution of streaming DFG with handle ```G```.
+Pop and return data produced from one execution of the streaming DFG with handle ```G```. The return type is a struct containing a field for every output of the DFG.
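+
+For illustration, a minimal sketch of popping one result follows; the struct name ```RootOut```, its ```result``` field, and the header name are assumptions for a DFG with a single ```int``` output:
+
+```c
+#include <stdio.h>
+#include "hpvm.h"  // assumed HPVM-C header
+
+// Hypothetical output struct: one field per DFG output, in order.
+struct RootOut {
+  int result;
+};
+
+void readOneResult(void *G) {
+  // Blocks until the outputs of one graph execution are available.
+  struct RootOut *out = (struct RootOut *)__hpvm__pop(G);
+  printf("result = %d\n", out->result);
+}
+```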
 
 ## Internal Node API
 
@@ -48,7 +45,7 @@ Binds the output ```op``` of the current node to output ```oc``` of child node f
 
 ```void __hpvm__hint(enum Target target)``` (C\)  
 ```void __hpvm__hint(hpvm::Target target)``` (C++)  
-Must be called once in each node function. Indicates which hardware target the current function should run in
+Must be called once in each node function. Indicates which hardware target the current function should run on.
 
 ```void __hpvm__attributes(unsigned ni, …, unsigned no, …)```  
 Must be called once at the beginning of each node function. Defines the properties of the pointer arguments to the current function. ```ni``` represents the number of input arguments, and ```no``` the number of output arguments. The arguments following ```ni``` are the input arguments, and the arguments following ```no``` are the output arguments. Arguments can be marked as both input and output. All pointer arguments must be included.
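+
+For illustration, a minimal leaf node sketch combining ```__hpvm__hint``` and ```__hpvm__attributes``` follows; the function name, its parameters, and the ```CPU_TARGET``` enumerator are assumptions for this example:
+
+```c
+#include <stddef.h>
+#include "hpvm.h"  // assumed HPVM-C header
+
+// Hypothetical leaf node: In is input-only, Out is output-only.
+// Each pointer argument is followed by its size, per the pointer convention.
+void VecCopy(float *In, size_t InSize, float *Out, size_t OutSize) {
+  __hpvm__hint(CPU_TARGET);           // assumed target enumerator
+  __hpvm__attributes(1, In, 1, Out);  // ni = 1 input (In), no = 1 output (Out)
+
+  long i = __hpvm__getNodeInstanceID_x(__hpvm__getNode());
+  Out[i] = In[i];                     // each dynamic instance copies one element
+}
+```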
@@ -76,38 +73,32 @@ Returns the dynamic ID of the current instance of node ```N``` in the x, y, or z
 ```long __hpvm__getNumNodeInstances_{x,y,z}(void* N)```  
 Returns the number of dynamic instances of node ```N``` in the x, y, or z dimension respectively. The dimension must be one of the dimensions in which the node is replicated.
 
-```void __hpvm__barrier()```  
-Local synchronization barrier across dynamic instances of current leaf node.
-
 ```void* __hpvm__malloc(long nBytes)```  
-Allocate a block of memory of size ```nBytes``` and returns a pointer to it. The allocated object can be shared by all nodes, although the pointer returned must somehow be communicated explicitly for use by other nodes.
+Allocates a block of memory of size ```nBytes``` and returns a pointer to it. The allocated object can be shared by all nodes. *Note that the returned pointer must somehow be communicated explicitly for use by other nodes.*
 
 ```int __hpvm__atomic_add(int* m, int v)```  
-Atomically adds ```v``` to the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
+Atomically adds ```v``` to the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
 
 ```int __hpvm__atomic_sub(int* m, int v)```  
-Atomically subtracts ```v``` from the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
-
-```int __hpvm__atomic_xchg(int* m, int v)```  
-Atomically swaps ```v``` with the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
-
-```int __hpvm__atomic_inc(int* m)```  
-Atomically increments the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
-
-```int __hpvm__atomic_dec(int* m)```  
-Atomically decrements the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
+Atomically subtracts ```v``` from the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
 
 ```int __hpvm__atomic_min(int* m, int v)```  
-Atomically computes the min of ```v``` and the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
+Atomically computes the min of ```v``` and the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
 
 ```int __hpvm__atomic_max(int* m, int v)```  
-Atomically computes the max of ```v``` and the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
+Atomically computes the max of ```v``` and the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
+
+```int __hpvm__atomic_xchg(int* m, int v)```  
+Atomically swaps ```v``` with the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
 
 ```int __hpvm__atomic_and(int* m, int v)```  
-Atomically computes the bitwise AND of ```v``` and the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
+Atomically computes the bitwise AND of ```v``` and the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
 
 ```int __hpvm__atomic_or(int* m, int v)```  
-Atomically computes the bitwise OR of ```v``` and the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
+Atomically computes the bitwise OR of ```v``` and the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
 
 ```int __hpvm__atomic_xor(int* m, int v)```  
-Atomically computes the bitwise XOR of ```v``` and the value stored at memory location ```[m]```. Returns the value previously stored at ```[m]```.
+Atomically computes the bitwise XOR of ```v``` and the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
+
+```void __hpvm__barrier()```  
+Local synchronization barrier across dynamic instances of current leaf node.
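+
+For illustration, a hedged sketch combining an atomic operation with the barrier follows; the node function, its parameters, and the ```CPU_TARGET``` enumerator are assumptions:
+
+```c
+#include <stddef.h>
+#include "hpvm.h"  // assumed HPVM-C header
+
+// Hypothetical leaf node: every dynamic instance tallies one element into a
+// shared counter, then all instances synchronize before the total is valid.
+void CountPositives(int *In, size_t InSize, int *Count, size_t CountSize) {
+  __hpvm__hint(CPU_TARGET);                    // assumed target enumerator
+  __hpvm__attributes(2, In, Count, 1, Count);  // Count is both input and output
+
+  long i = __hpvm__getNodeInstanceID_x(__hpvm__getNode());
+  if (In[i] > 0)
+    __hpvm__atomic_add(Count, 1);  // atomic w.r.t. this node's dynamic instances
+
+  __hpvm__barrier();               // every instance arrives before any proceeds
+  // After the barrier, *Count holds the complete tally for this execution.
+}
+```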
diff --git a/hpvm/docs/hpvm-specification.md b/hpvm/docs/hpvm-specification.md
index 157e57452aedd94d2b06b56958d44c07ede17fed..cd61d95b4e3d4f4068a985bd5f3bac4578f6e14d 100644
--- a/hpvm/docs/hpvm-specification.md
+++ b/hpvm/docs/hpvm-specification.md
@@ -12,13 +12,13 @@ Internal nodes only create the structure of the child graph, and cannot include
 
 Leaf nodes contain code expressing actual computations. Leaf nodes may contain instructions to query the structure of the underlying DFG, and any non host side HPVM operation for synchronization and memory allocation.
 
-Note that the graph is full interpreted at compile-time and  cannot be modified at runtime except for the number of dynamic instances, which can be data dependent.
+Note that the graph is fully interpreted at compile-time and cannot be modified at runtime except for the number of dynamic instances, which can be data dependent.
 
 
 ## Dataflow Edge
 A *dataflow edge* from the output ```out``` of a source dataflow node ```Src``` to the input ```in``` of a sink dataflow node ```Dst``` describes the explicit data transfer requirements. ```Src``` and ```Dst``` node must belong to the same child graph, i.e. must be children of the same internal node.
 
-An edge from source to sink has the semantics of copying the specified data from the source to the sink after the source node has completed execution. The pairs ```(Src, out)```, and ```(Dst, in)``` must be unique w.r.t. every other edge in the same child graph, i.e. two dataflow edges in the same child graph cannot have the same source or destination.
+An edge from source to sink has the semantics of copying the specified data from the source to the sink after the source node has completed execution. The pairs ```(Src, out)``` and ```(Dst, in)```, representing source and sink respectively, must be unique w.r.t. every other edge in the same child graph, i.e. two dataflow edges in the same child graph cannot have the same source or destination.
 
 A static edge also represents multiple dynamic instances of that edge between the dynamic instances of the source and the sink nodes.
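+
+For illustration, a hedged sketch of creating such an edge inside an internal node follows, written with the HPVM-C front-end API rather than the raw intrinsics. The leaf node functions, argument positions, and the assumed ```__hpvm__edge(src, dst, replType, sp, dp, isStream)```, ```__hpvm__createNodeND```, and ```__hpvm__bindIn``` signatures are illustrative assumptions:
+
+```c
+#include <stddef.h>
+#include "hpvm.h"  // assumed HPVM-C header
+
+// Assumed leaf node functions:
+void Producer(float *In, size_t InSize);             // outputs one scalar
+void Consumer(float v, float *Out, size_t OutSize);  // consumes that scalar
+
+// Hypothetical internal node: wires Producer's scalar output to Consumer.
+void Root(float *In, size_t InSize, float *Out, size_t OutSize) {
+  __hpvm__hint(CPU_TARGET);                // assumed target enumerator
+  __hpvm__attributes(2, In, Out, 1, Out);
+
+  void *P = __hpvm__createNodeND(0, Producer);  // single-instance children
+  void *C = __hpvm__createNodeND(0, Consumer);
+
+  __hpvm__bindIn(P, 0, 0, 0);      // Root input 0 (In) -> Producer input 0
+  __hpvm__bindIn(P, 1, 1, 0);      // ...followed by its i64 size, per the pointer rule
+  __hpvm__bindIn(C, 2, 1, 0);      // Root input 2 (Out) -> Consumer input 1
+  __hpvm__bindIn(C, 3, 2, 0);      // ...and its size
+  __hpvm__edge(P, C, 1, 0, 0, 0);  // one-to-one edge: Producer out 0 (a scalar) -> Consumer in 0
+}
+```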
 
@@ -48,19 +48,25 @@ Stop tracking specified memory object and remove it from memory tracker.
 - **Request Memory**:
 If the specified memory object is not present in host memory, copy it to host memory.
 - **Launch**:
-The host code initiates execution of specified DFG, either streaming or non streaming, and provides initial data. All data for one graph execution must be provided.
-- **Wait**:
-The host code blocks for completion of specified DFG.
+The host code initiates execution of the specified DFG, either streaming or non-streaming.
+    - Non-streaming DFG: The host provides all data items required for execution of the DFG at the time of the launch.
+    - Streaming DFG: No data is provided by the launch operation. Streaming execution is sustained by push and pop operations, described below.
 - **Push**:
-Push a set of data required for one graph execution to the specified DFG. The DFG must have been launched using a streaming launch operation. This is a blocking operation.
+Push a set of data items required for one graph execution to the specified DFG. The DFG must have been launched using a streaming launch operation. This is a blocking operation.
 - **Pop**:
 Read data produced from one execution of the specified DFG. The DFG must have been launched using a streaming launch operation. This is a blocking operation.
+- **Wait**:
+The host code blocks for completion of the specified DFG.
+    - For a non-streaming DFG, the data produced by the DFG are ready to be read by the host once the wait returns.
+    - For a streaming DFG, no more data may be provided for processing by the DFG once the wait is issued.
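+
+As an illustration of the launch/push/pop/wait sequence, a hedged sketch of a streaming host loop using the HPVM-C host API follows. The ```Root``` function, the argument and output struct layouts, and the header name are assumptions:
+
+```c
+#include <stddef.h>
+#include "hpvm.h"  // assumed HPVM-C header
+
+void Root(float *In, size_t InSize, float *Out, size_t OutSize);  // assumed root node
+
+typedef struct { float *In; size_t InSize; float *Out; size_t OutSize; } RootArgs;
+typedef struct { float result; } RootOut;  // hypothetical: one field per DFG output
+
+void runStream(RootArgs *args, int nItems) {
+  __hpvm__init();
+  llvm_hpvm_track_mem(args->In, args->InSize);      // make In known to the runtime
+
+  void *G = __hpvm__launch(1, Root, (void *)args);  // streaming launch: no data yet
+  for (int i = 0; i < nItems; ++i) {
+    __hpvm__push(G, (void *)args);                  // one input set per graph execution
+    RootOut *out = (RootOut *)__hpvm__pop(G);       // blocks for one execution's output
+    (void)out;                                      // consume the result here
+  }
+  __hpvm__wait(G);                                  // no more data may be pushed after this
+
+  llvm_hpvm_untrack_mem(args->In);
+  __hpvm__cleanup();
+}
+```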
 
 # HPVM Implementation
 
 This section describes the implementation of HPVM on top of LLVM IR.
 
-We use intrinsic functions to implement the HPVM IR. iN is the N-bit integer type in LLVM.
+```iN``` is the N-bit integer type in LLVM.
+
+We use intrinsic functions to implement the HPVM IR.
 
 The code for each dataflow node is given as a separate LLVM function, called the node function. The node function may call additional, auxiliary functions. However, the auxiliary functions are not allowed to include any HPVM intrinsics, as they are not considered to be part of the HPVM node hierarchy.
 
@@ -123,7 +129,7 @@ The following intrinsics are used for memory allocation and synchronization. The
 
 ```i8* llvm.hpvm.malloc(i64 nBytes)```  
 Allocate a block of memory of size ```nBytes``` and return pointer to it. The allocated object can be shared by all nodes.  
-*Note that the pointer returned must somehow be communicated explicitly for use by other nodes.*
+*Note that the returned pointer must somehow be communicated explicitly for use by other nodes.*
 
 ```i32 llvm.hpvm.atomic.add(i8* m, i32 v)```  
 Atomically computes the bitwise ADD of ```v``` and the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
@@ -144,7 +150,7 @@ Atomically computes the bitwise XCHG of ```v``` and the value stored at memory l
 Atomically computes the bitwise AND of ```v``` and the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
 
 ```i32 llvm.hpvm.atomic.or(i8* m, i32 v)```  
-Atomically computes the bitwise XOR of ```v``` and the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
+Atomically computes the bitwise OR of ```v``` and the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
 
 ```i32 llvm.hpvm.atomic.xor(i8* m, i32 v)```  
 Atomically computes the bitwise XOR of ```v``` and the value stored at memory location ```[m]``` w.r.t. the dynamic instances of the current leaf node and stores the result back into ```[m]```. Returns the value previously stored at ```[m]```.
@@ -185,6 +191,8 @@ Pop and return data from streaming DFG with handle ```GraphID```. The return typ
 
 ## Implementation Limitations
 Due to limitations of our current prototype implementation, the following restrictions are imposed:
+
 - In HPVM, a memory object is represented as a (pointer, size) pair that includes the address of memory object, and the size (in bytes) of the pointed-to object. Therefore, when an edge/bind carries a pointer, it must be followed by an i64 size value.           
 - Pointers cannot be transferred between nodes using dataflow edges. Instead, they should be passed using the bind operation from the (common) parent of the source and sink nodes.
+
 - Instantiation of dataflow nodes is supported in up to three dimensions.
diff --git a/hpvm/projects/hpvm-rt/CMakeLists.txt b/hpvm/projects/hpvm-rt/CMakeLists.txt
index be7f69c4bfa7623c093bd5e913af1de3dbcf951c..5d75c6e0f12622ac11957fdb29d5691eeb979017 100644
--- a/hpvm/projects/hpvm-rt/CMakeLists.txt
+++ b/hpvm/projects/hpvm-rt/CMakeLists.txt
@@ -2,17 +2,18 @@ add_definitions(-DNUM_CORES=8)
 
 SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/bin/clang)
 SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/bin/clang++)
+SET(CMAKE_CXX_STANDARD 11)
+# Defines ${OpenCL_INCLUDE_DIRS} and ${OpenCL_LIBRARY} if found
+find_package(OpenCL REQUIRED)
 
 add_llvm_library(hpvm-rt.ll hpvm-rt.cpp
-
   DEPENDS
   clang
   llvm-dis
-  )
-
-
-target_compile_options(hpvm-rt.ll PUBLIC -flto )
-target_compile_options(hpvm-rt.ll PUBLIC -std=c++11)
+)
+target_compile_options(hpvm-rt.ll PUBLIC -flto)
+target_include_directories(hpvm-rt.ll PRIVATE ${OpenCL_INCLUDE_DIRS})
+# ${OpenCL_LIBRARY} is the library file itself; derive its directory for linking
+get_filename_component(OpenCL_LIBRARY_DIR ${OpenCL_LIBRARY} DIRECTORY)
+target_link_directories(hpvm-rt.ll PRIVATE ${OpenCL_LIBRARY_DIR})
 
 add_custom_target(hpvm-rt.cpp.o ALL
   COMMAND ar -x ${CMAKE_BINARY_DIR}/lib/libhpvm-rt.ll.a
diff --git a/hpvm/test/README.md b/hpvm/test/README.md
index 0032268c55b2234118a3ec9243a8191147eb1e1f..59832c64452d1103d8ff0cd7148a9379611b810f 100644
--- a/hpvm/test/README.md
+++ b/hpvm/test/README.md
@@ -1,5 +1,7 @@
 # Using HPVM
-Tests are provided, along with a template Makefile for user projects.
+The benchmarks below are provided with HPVM, along with a template Makefile for user projects. To build the existing benchmarks, create a new `Makefile.config` in [include](/hpvm/test/include), based on the existing `Makefile.config.example`. This configuration file must set up the following paths (see the sketch after this list):
+* `LLVM_BUILD_DIR`: should point to your local HPVM `build` directory.
+* `CUDA_PATH`: should point to your local CUDA installation.
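+
+A minimal `Makefile.config` sketch (the paths below are illustrative placeholders, not defaults):
+
+```make
+# Substitute the paths from your own setup.
+LLVM_BUILD_DIR = /path/to/hpvm/build
+CUDA_PATH = /usr/local/cuda
+```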
 
 ## Parboil
 Several tests from the [parboil suite](http://impact.crhc.illinois.edu/parboil/parboil.aspx) have been ported to HPVM.