diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h
index 947556489de2fb94ef7d793dd46eb74db2240516..83781b148c4bb41619bbbb54d9e69cc9fc7f2543 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h
@@ -1,10 +1,15 @@
+/*
+device_math.h
+Provides pointers to CUDA math functions and other properties of a math
+operator (one among MathOp) on a certain scalar type.
+*/
 #ifndef DEVICE_MATH_H
 #define DEVICE_MATH_H
 
-#include <device_launch_parameters.h>
 #include <cuda_fp16.h>
-#include <stdexcept>
+#include <device_launch_parameters.h>
 #include <limits>
+#include <stdexcept>
 
 #include "debug.h"
 
@@ -24,12 +29,18 @@ enum class MathOp {
   PSNR
 };
 
+// Finds the CUDA function implementing math operator `op` on scalar type T.
+// This is ONLY defined (through template specialization, in device_math.cu) for
+// float and half (see below).
 template <typename T> void *mathOpToFunc(MathOp op);
 
 template <> void *mathOpToFunc<float>(MathOp op);
 
 template <> void *mathOpToFunc<half>(MathOp op);
 
+// Returns the identity element of math operator `op`: for example, -inf for
+// MAX and 0 for ADD.
+// A specialization exists for the half type.
 template <typename T> T reduceOpToIdentity(MathOp op) {
   switch (op) {
   case MathOp::Hypot:
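For context, a minimal usage sketch of the identity-element contract documented above. `MathOp::Max` appears later in this patch; `MathOp::Add` is an assumed name suggested by the "0 for ADD" comment.

```cpp
// Usage sketch only, not part of the patch.
#include <limits>
#include "device_math.h"

void identityExamples() {
  // max(x, -inf) == x, so -inf is the identity of Max.
  float max_id = reduceOpToIdentity<float>(MathOp::Max); // -std::numeric_limits<float>::infinity()
  // x + 0 == x, so 0 is the identity of Add (enum member name assumed).
  float add_id = reduceOpToIdentity<float>(MathOp::Add); // 0.0f
  (void)max_id;
  (void)add_id;
}
```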
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h
index bc85495e209eb8dcd53d8d5b1cc52bc3c4ddcf4e..71099a89e4ff1c47a14c4652556838e55c3850ea 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h
@@ -1,11 +1,20 @@
+/*
+broadcast.h
+Calculates the shape of two tensors broadcast together, using a numpy-like
+(but weaker) rule.
+*/
+
+#ifndef FUNCTIONAL_BROADCAST_H
+#define FUNCTIONAL_BROADCAST_H
+
 #include <algorithm>
 #include <array>
 #include <cstddef>
 #include <type_traits>
 
 #include "common.h"
-#include "tensor.h"
 #include "debug.h"
+#include "tensor.h"
 
 // TODO: don't accept N == 1
 template <size_t N, typename std::enable_if<N >= 1, int>::type = 0>
@@ -72,3 +81,5 @@ private:
   std::vector<size_t> out_sizes, sizes[N];
   size_t tail_stride[N];
 };
+
+#endif // FUNCTIONAL_BROADCAST_H
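Since the header only declares the shape rule, here is a self-contained sketch of the usual numpy-style broadcast computation, for reference. BroadcastRemap's rule is described as weaker, so it may reject cases this sketch accepts.

```cpp
// Illustrative only -- not the BroadcastRemap implementation.
#include <stdexcept>
#include <utility>
#include <vector>

std::vector<size_t> broadcastShape(std::vector<size_t> a, std::vector<size_t> b) {
  // Right-align the two shapes, padding the shorter one with 1s.
  if (a.size() < b.size())
    std::swap(a, b);
  b.insert(b.begin(), a.size() - b.size(), 1);
  std::vector<size_t> out(a.size());
  for (size_t i = 0; i < a.size(); i++) {
    if (a[i] == b[i] || b[i] == 1)
      out[i] = a[i];
    else if (a[i] == 1)
      out[i] = b[i];
    else
      throw std::runtime_error("Shapes are not broadcastable");
  }
  return out;
}
// e.g. broadcastShape({4, 3, 1, 5}, {3, 2, 5}) == {4, 3, 2, 5}
```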
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
index 76a26d249c2ef1dc9db196f80cd99c1054e12603..00326bef03b78d905f5923ae3ab5a79f327c2e7b 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
@@ -1,5 +1,13 @@
-#ifndef IMAGE_PROCESSING_COMMON_H
-#define IMAGE_PROCESSING_COMMON_H
+/*
+common.h
+Helper functions shared among functional/* header files and their corresponding
+*.cu files.
+These include CUDA utility functions, __device__ helpers, and Tensor methods
+that really should be in `struct Tensor`.
+*/
+
+#ifndef FUNCTIONAL_COMMON_H
+#define FUNCTIONAL_COMMON_H
 
 #include <cuda_fp16.h>
 #include <cudnn.h>
@@ -8,20 +16,15 @@
 #include <vector>
 
 #include "debug.h"
-#include "tensor.h"
 #include "profiling.h"
+#include "tensor.h"
 
+// Return ceil(a / b) for both host and device.
 template <typename T> __host__ __device__ __forceinline__ T ceilDiv(T a, T b) {
   return (a + b - 1) / b;
 }
 
-template <typename T> __host__ T resolve_func_ptr(void *func_symbol_ptr) {
-  void *v_func_ptr = nullptr;
-  checkCudaErrors(cudaMemcpyFromSymbol(
-      &v_func_ptr, *(void **)func_symbol_ptr, sizeof(void *)));
-  return (T)v_func_ptr;
-}
-
+// Profiles float -> half conversion; used as a scope guard.
 template <typename T> class HFProfileGuard {
   static const char *getEventName(bool end) {
     if (typeid(T) == typeid(half) || typeid(T) == typeid(half2))
@@ -31,6 +34,10 @@ template <typename T> class HFProfileGuard {
   }
 
   static bool needProfiling() {
+    // Only profile when the given type T is half / half2.
+    // As this guard is often used in a templated, scalar-type-agnostic
+    // implementation of an operator, `T` is usually that operator's scalar
+    // type.
     return typeid(T) == typeid(half) || typeid(T) == typeid(half2);
   }
 
@@ -46,6 +53,8 @@ public:
   }
 };
 
+// Converts a C++ type (given by template parameter T) to a "tensor datatype",
+// an enum value that `struct Tensor` recognizes.
 template <typename T> int getTensorType() {
   if (typeid(T) == typeid(float))
     return (int)float_type;
@@ -61,6 +70,11 @@ template <typename T> int getTensorType() {
   }
 }
 
+// Type-casts Tensor `t` to type `T` (regardless of the current type of `t`)
+// and returns a pointer to its underlying data on the GPU (which can be
+// t->gpu_data or t->gpu_half_data).
+// This is specialized and implemented for float, float2 (float-complex), half,
+// and half2 (used to speed up operations in half type).
 template <typename T> T *convertAndGetGPUData(Tensor *t);
 
 template <> float *convertAndGetGPUData<float>(Tensor *t);
@@ -71,18 +85,35 @@ template <> half *convertAndGetGPUData<half>(Tensor *t);
 
 template <> half2 *convertAndGetGPUData<half2>(Tensor *t);
 
+// Like convertAndGetGPUData, but calls `convertToFP32_offline` instead of
+// `convertToFP32`, which makes a difference when online / offline profiling is
+// involved.
 void convertToFloat2Offline(Tensor *t);
 
+// Returns the sizes of a tensor as a vector.
 std::vector<size_t> sizes(Tensor *t);
 
 std::vector<size_t> sizes(const Dimension &dim);
 
+// Returns the total number of elements in a tensor.
 size_t num_elems(const std::vector<size_t> &dim_sizes);
 
 size_t num_elems(const Dimension &dim);
 
 size_t num_elems(Tensor *t);
 
+// Checks equivalence of types t1 and t2 under the assumption that float=half
+// and float2=half2, and returns the equalized type.
+// 1. Define an equivalence operator (==):
+//  t == t          = True
+//  float == half   = True
+//  float2 == half2 = True
+//  otherwise       = False
+// and throw if t1 != t2.
+// 2. Return the common type. But as float is not _actually_ the same thing as
+// half, `get_half` determines which one to return. E.g. with t1 == float2 and
+// t2 == half2, if get_half == true, half2 is returned; otherwise float2 is
+// returned.
 Tensor_type_t getCompatibleType(int t1, int t2, bool get_half);
 
-#endif // IMAGE_PROCESSING_COMMON_H
+#endif // FUNCTIONAL_COMMON_H
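To make the two-step description of getCompatibleType concrete, a sketch of the documented rule. float_type and float2_type appear elsewhere in this patch; half_type and half2_type are assumed enum member names, and the real implementation lives in the corresponding source file, not here.

```cpp
// Sketch of the documented rule only, not the actual implementation.
// Assumes Tensor_type_t from tensor.h and the ERROR macro from debug.h.
Tensor_type_t getCompatibleTypeSketch(int t1, int t2, bool get_half) {
  // Collapse half onto float and half2 onto float2 (the "==" relation above).
  auto widen = [](int t) {
    if (t == (int)half_type)  return (int)float_type;  // assumed enum member
    if (t == (int)half2_type) return (int)float2_type; // assumed enum member
    return t;
  };
  if (widen(t1) != widen(t2))
    ERROR("Types %d and %d are incompatible\n", t1, t2);
  // Same equalized type; `get_half` picks the concrete representation.
  if (widen(t1) == (int)float_type)
    return get_half ? half_type : float_type;
  if (widen(t1) == (int)float2_type)
    return get_half ? half2_type : float2_type;
  return (Tensor_type_t)t1; // identical non-float types pass through
}
```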
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
index 525afa9529fc25f9b3d7c3082980a5582771ef67..74568d8183a7a64f48750b4d02a6286224cac817 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
@@ -1,5 +1,10 @@
-#ifndef RUNTIME_MAP_H
-#define RUNTIME_MAP_H
+/*
+map.cuh
+Implementation of the map operator, including broadcasting, the CUDA kernel for
+map, and a general map_n function in host code (which calls the kernel).
+*/
+#ifndef FUNCTIONAL_MAP_H
+#define FUNCTIONAL_MAP_H
 
 #include <array>
 #include <cstddef>
@@ -13,17 +18,20 @@
 #include "tensor.h"
 #include "tensor_utils.h"
 
+// Checks dimension and data order of each map argument.
 template <size_t N> void mapPrecheck(const std::array<Tensor *, N> &srcs) {
   for (Tensor *src : srcs) {
     if (src->dims.num_dims != 4 || src->data_format != CUDNN_TENSOR_NCHW)
-      ERROR("Not supported\n"); // TODO: support this
+      ERROR("Not supported\n");
   }
 }
 
+// CUDA kernel for map_n. This is _actually_ mostly unused, as specializations
+// for float / half exist (below) for performance.
 template <typename Scalar, size_t N>
 __global__ void kernelMapBroadcast(
-    Scalar *target, unsigned num_rows,  void *func,
-    Scalar **srcs, size_t *tail_strides) {
+    Scalar *target, unsigned num_rows, void *func, Scalar **srcs,
+    size_t *tail_strides) {
   auto *n_ary_op = (NTo1MapF<Scalar, N>)func;
 
   unsigned threadId = blockIdx.x * blockDim.x + threadIdx.x,
@@ -39,19 +47,23 @@ __global__ void kernelMapBroadcast(
 // Instantiate float to compare fairly to half. Implemented for N = 1...3
 template <size_t N>
 __global__ void kernelMapBroadcast<float, N>(
-    half *target, unsigned num_rows,  void *func,
-    half **srcs, size_t *tail_strides);
+    half *target, unsigned num_rows, void *func, half **srcs,
+    size_t *tail_strides);
 
 // Half uses a different implementation. Implemented for N = 1, 2
 template <size_t N>
 __global__ void kernelMapBroadcast<half, N>(
-    half *target, unsigned num_rows,  void *func,
-    half **srcs, size_t *tail_strides);
+    half *target, unsigned num_rows, void *func, half **srcs,
+    size_t *tail_strides);
 
+// Creates parameters for the CUDA kernel by copying pointers to the device
+// (GPU). This function unwraps BroadcastRemap into a device array of size N
+// (one value for the broadcast stride of each map argument) and unwraps `srcs`
+// into their GPU data pointers.
 template <typename Scalar, size_t N>
 std::tuple<size_t *, Scalar **> make_cuda_params(
     const BroadcastRemap<N> &br, const std::array<Tensor *, N> &srcs) {
-  for (Tensor *t: srcs)
+  for (Tensor *t : srcs)
     hostToDeviceCopy(t);
   std::array<Scalar *, N> gpu_datas;
   {
@@ -73,6 +85,8 @@ std::tuple<size_t *, Scalar **> make_cuda_params(
   return std::make_tuple(cuda_strides, cuda_gpu_data);
 }
 
+// Host code for map_n that checks and converts the parameters, then calls the
+// CUDA kernel.
 template <
     typename Scalar, size_t N, typename std::enable_if<N >= 1, int>::type = 0>
 __host__ Tensor *mapGeneral(MathOp mop, const std::array<Tensor *, N> &srcs) {
@@ -95,8 +109,8 @@ __host__ Tensor *mapGeneral(MathOp mop, const std::array<Tensor *, N> &srcs) {
   unsigned threads = std::min(max_threads, n_elem);
   unsigned grids = std::min(max_grid, ceilDiv(n_elem, threads));
   kernelMapBroadcast<Scalar, N><<<grids, threads>>>(
-      convertAndGetGPUData<Scalar>(target), n_elem, func_ptr,
-      gpu_data, cuda_strides);
+      convertAndGetGPUData<Scalar>(target), n_elem, func_ptr, gpu_data,
+      cuda_strides);
   cudaDeviceSynchronize();
   checkCUDA(cudaGetLastError());
   return target;
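A usage sketch of the host entry point declared above. MathOp::Add is an assumed name (suggested by the "0 for ADD" comment in device_math.h); tensorMap2Half in img_tensor_runtime.cu, further down in this patch, calls mapGeneral in exactly this way.

```cpp
// Sketch only: both inputs must be 4-D NCHW (mapPrecheck enforces this) and
// broadcast-compatible; the result tensor is allocated by mapGeneral.
#include "functional/map.cuh"

Tensor *elementwiseAddSketch(Tensor *a, Tensor *b) {
  return mapGeneral<float, 2>(MathOp::Add, {a, b});
}
```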
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h
index 4cb93661384a572425361a053371c409606de164..c6c804fa00f1ae5eb324d6928d8f3c43b1231d14 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h
@@ -1,7 +1,11 @@
-#ifndef IMAGE_PROCESSING_MAP_TYPING_H
-#define IMAGE_PROCESSING_MAP_TYPING_H
+/*
+map_typing.h
+Metaprogramming helpers used by map.cuh.
+Defines some recursively templated types and functions.
+*/
+#ifndef FUNCTIONAL_MAP_TYPING_H
+#define FUNCTIONAL_MAP_TYPING_H
 
-// Constructs type T (*)(T, T, T, T, ... <n_times>) from T and N
 #include <cstddef>
 #include <device_launch_parameters.h>
 #include <tuple>
@@ -17,9 +21,13 @@ struct _RepNType<T, W, std::index_sequence<Is...>> {
   using type = W<Type<T, Is>...>;
 };
 
+// Constructs type W<T, T, T, T, ... (N times)> from T and N
+// RepNType T W N = W (T, T, T ... N times ..., T)
 template <typename T, template <typename...> typename W, size_t N>
 using RepNType = typename _RepNType<T, W, std::make_index_sequence<N>>::type;
 
+// Like std::function<Ret(Args...)>, but denotes a raw function pointer instead
+// of a general callable object.
 template <typename Ret, typename... Args> using FuncPtrT = Ret (*)(Args...);
 
 template <typename Ret, typename Arg, size_t N> struct _NAToBFunc {
@@ -29,11 +37,15 @@ template <typename Ret, typename Arg, size_t N> struct _NAToBFunc {
 };
 } // namespace
 
+// NAToBF Ret Arg N = Ret(*)(Arg, Arg, ...N times)
 template <typename Ret, typename Arg, size_t N>
 using NAToBF = typename _NAToBFunc<Ret, Arg, N>::type;
 
+// NTo1MapF Arg N = Arg(*)(Arg, Arg, ...N times)
+// This denotes n-to-1 map: Arg x Arg x Arg x ... -> Arg.
 template <typename Scalar, size_t N> using NTo1MapF = NAToBF<Scalar, Scalar, N>;
 
+// RepNTuple T N = std::tuple<T, T, ...N times>
 template <typename T, size_t N> using RepNTuple = RepNType<T, std::tuple, N>;
 
 namespace {
@@ -49,20 +61,25 @@ __device__ auto call(Function f, Tuple t, std::index_sequence<I...>) {
 }
 } // namespace
 
+// Converts an iterable of element type T and length N to a (same-typed) tuple
+// std::tuple<T, T, T, ... N times>
 template <typename TIterable, typename T, size_t N>
 constexpr RepNTuple<T, N> as_tuple(TIterable arr) {
   return as_tuple<TIterable, T>(arr, std::make_index_sequence<N>{});
 }
 
+// Expands Tuple t into the parameters of Function f; in Python this would be
+// f(*t).
 template <typename Function, typename Tuple>
 __device__ auto call_on_tuple(Function f, Tuple t) {
   static constexpr auto size = std::tuple_size<Tuple>::value;
   return call(f, t, std::make_index_sequence<size>{});
 }
 
+// Expands a C array of type T and size N into the parameters of Function f.
 template <typename Ret, typename T, size_t N>
 __device__ Ret call_on_c_array(NAToBF<Ret, T, N> f, const T arr[N]) {
   return call_on_tuple(f, as_tuple<const T *, T, N>(arr));
 }
 
-#endif // IMAGE_PROCESSING_MAP_TYPING_H
+#endif // FUNCTIONAL_MAP_TYPING_H
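The aliases above are pure type-level constructions, so their documented expansions can be checked at compile time; a minimal sketch:

```cpp
// Compile-time illustration of the documented expansions (not part of the patch).
#include <tuple>
#include <type_traits>
#include "functional/map_typing.h"

static_assert(std::is_same<RepNTuple<float, 3>,
                           std::tuple<float, float, float>>::value,
              "RepNTuple T N = std::tuple<T, T, ... N times>");
static_assert(std::is_same<NAToBF<int, float, 2>, int (*)(float, float)>::value,
              "NAToBF Ret Arg N = Ret(*)(Arg, ... N times)");
static_assert(std::is_same<NTo1MapF<float, 2>, float (*)(float, float)>::value,
              "NTo1MapF Arg N = Arg(*)(Arg, ... N times)");
```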
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
index de5402942cf7be965a342ebfedf80d21475c49d3..9f4fabfb5e0b75017e901c2cb4c60d8649b04f07 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
@@ -1,3 +1,7 @@
+/*
+reduce.cuh
+Implementation of the reduce operator.
+*/
 #include <device_launch_parameters.h>
 #include <functional>
 #include <numeric>
@@ -76,7 +80,8 @@ __global__ void kernelReduceDimSeq(
     for (size_t irow = start_irow; irow < num_irows; irow += irow_stride) {
       K *src = src_ + orow * row_size * num_irows + irow;
       K *target = target_ + orow * num_irows + irow;
-      reduceAlongDim(target, src, init, identity, func, num_irows, approx_row_size);
+      reduceAlongDim(
+          target, src, init, identity, func, num_irows, approx_row_size);
     }
   }
 }
@@ -101,16 +106,17 @@ __global__ void __launch_bounds__(NThreads) kernelReduceDimParallel(
       K *src = src_ + orow * row_size * num_irows + irow;
       K *target = target_ + orow * num_irows + irow;
       parallelReduceAlongDim(
-          target, src, init, identity, func, num_irows, approx_row_size, threadIdx.x,
-          blockDim.x);
+          target, src, init, identity, func, num_irows, approx_row_size,
+          threadIdx.x, blockDim.x);
     }
   }
 }
 
+/* Entry point of the `reduce` implementation. Calls the appropriate version of
+ * the reduction kernel as needed. */
 template <typename Scalar>
 __host__ Tensor *reduceDim(
-    Tensor *src, const Scalar &init, MathOp op, size_t axis,
-    float skip_rate) {
+    Tensor *src, const Scalar &init, MathOp op, size_t axis, float skip_rate) {
   // Copy input over
   hostToDeviceCopy(src);
 
@@ -156,8 +162,8 @@ __host__ Tensor *reduceDim(
     dim3 threads(AlongDimTh, CrossDimTh);
     dim3 grid(grid_x, grid_y);
     kernelReduceDimParallel<Scalar><<<grid, threads>>>(
-        convertAndGetGPUData<Scalar>(target), src_data,
-        init, identity, func, num_irows, num_orows, row_size, approx_row_size);
+        convertAndGetGPUData<Scalar>(target), src_data, init, identity, func,
+        num_irows, num_orows, row_size, approx_row_size);
   } else {
     DEBUG(
         "Reducing sequentially, row size = %lu, actually using %lu\n", row_size,
@@ -169,8 +175,8 @@ __host__ Tensor *reduceDim(
         std::min(MaxBlocksPerDim, num_orows), ceilDiv(MaxNBlocks, grid_x));
     dim3 grid(grid_x, grid_y);
     kernelReduceDimSeq<Scalar><<<grid, threads>>>(
-        convertAndGetGPUData<Scalar>(target), src_data,
-        init, identity, func, num_irows, num_orows, row_size, approx_row_size);
+        convertAndGetGPUData<Scalar>(target), src_data, init, identity, func,
+        num_irows, num_orows, row_size, approx_row_size);
   }
   cudaDeviceSynchronize();
   checkCUDA(cudaGetLastError());
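A usage sketch for reduceDim with a hypothetical caller; the real entry points are presumably tensorReduce / tensorReduceHalf declared in img_tensor_runtime.h.

```cpp
// Sketch only: reduce a float tensor along axis 2 with Max, no row skipping.
#include <limits>
#include "functional/reduce.cuh"

Tensor *maxAlongAxis2(Tensor *src) {
  float init = -std::numeric_limits<float>::infinity(); // identity of Max
  return reduceDim<float>(src, init, MathOp::Max, /*axis=*/2, /*skip_rate=*/0.0f);
}
```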
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
index 708c52917a3fc2e4eb92da7526350346f85ca459..608107e1dfb39bb268899227dc21f45d969de1f7 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
@@ -8,8 +8,10 @@
 // ***                        Runtime declaration                        *** //
 void *tensorFft(void *input, bool inverse);
 void *tensorFftHalf(void *input, bool inverse);
-void *tensorReduce(void *input, size_t axis, MathOp func, float skip_ratio = 0.0f);
-void *tensorReduceHalf(void *input, size_t axis, MathOp func, float skip_ratio = 0.0f);
+void *
+tensorReduce(void *input, size_t axis, MathOp func, float skip_ratio = 0.0f);
+void *tensorReduceHalf(
+    void *input, size_t axis, MathOp func, float skip_ratio = 0.0f);
 void *tensorProjectiveT(void *input, void *transformation);
 void *tensorMap1(MathOp f, void *i);
 void *tensorMap2(MathOp f2, void *i1, void *i2);
@@ -21,8 +23,8 @@ void *tensorMap3Half(MathOp f3, void *i1, void *i2, void *i3);
 // ***                      Wrapper API declaration                      *** //
 extern "C" {
 void *wrapper_tensorFft(const char *hpvm_node_id, void *input, bool inverse);
-void *wrapper_tensorReduce(
-    const char *hpvm_node_id, void *input, int axis, int func);
+void *
+wrapper_tensorReduce(const char *hpvm_node_id, void *input, int axis, int func);
 void *wrapper_tensorProjectiveT(
     const char *hpvm_node_id, void *input, void *transformation);
 void *wrapper_tensorMap1(const char *hpvm_node_id, int func, void *input);
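A sketch of chaining the runtime calls declared above; it mirrors the pattern used by normalize() in img_tensor_utils.cpp, which reduces over axis 2 and then axis 3 of an NCHW image tensor (the default skip_ratio of 0.0f is used).

```cpp
// Sketch only; perImageChannelMax is a hypothetical helper, not part of the API.
void *perImageChannelMax(void *image) {
  void *max_h = tensorReduce(image, /*axis=*/2, MathOp::Max); // reduce over H
  return tensorReduce(max_h, /*axis=*/3, MathOp::Max);        // then over W
}
```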
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h
index 6ebd8d52715e8509a62f7d6253d28586ca117b80..bf6664b0e87ce7fb68d0a8c0b992ba12e045c4d1 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h
@@ -1,3 +1,7 @@
+/*
+img_tensor_utils.h
+Util functions for image load/save, image quality calculation (PSNR), etc.
+*/
 #ifndef IMG_TENSOR_UTILS
 #define IMG_TENSOR_UTILS
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
index 163efd1a2dec403146ad83c1e00ef1cffa48222f..0e05813bb6eb5de86057bf3b2066c8fd98642e8d 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
@@ -1,3 +1,4 @@
+/* device_math.cu implements the specializations declared in device_math.h. */
 #include "device_math.h"
 #include "error.h"
 
@@ -18,8 +19,8 @@
 
 namespace _internal {
 
-// The following functions are not used, but they reference their cuda counterpart
-// which is necessary, otherwise ptx compilation breaks
+// The following functions are not used, but they reference their cuda
+// counterpart which is necessary, otherwise ptx compilation breaks
 __device__ float hypotf_(float x, float y) { return hypotf(x, y); }
 
 __device__ float atan2f_(float x, float y) { return atan2f(x, y); }
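These dummy wrappers matter because the address of a __device__ function can only be taken in device code. A sketch of the standard CUDA idiom for exporting such an address to the host follows; all names are hypothetical, and this is not necessarily how device_math.cu resolves its pointers (the removed resolve_func_ptr helper in functional/common.h used the same cudaMemcpyFromSymbol pattern).

```cpp
// Sketch of the usual CUDA function-pointer idiom (hypothetical names).
#include <cuda_runtime.h>

typedef float (*BinaryF)(float, float);

__device__ float my_hypot_(float x, float y) { return hypotf(x, y); }

// The device function's address is stored in a __device__ symbol...
__device__ BinaryF my_hypot_sym = my_hypot_;

// ...and copied to the host, where it can be handed to a kernel as `void *func`.
void *resolveMyHypot() {
  BinaryF host_copy = nullptr;
  cudaMemcpyFromSymbol(&host_copy, my_hypot_sym, sizeof(BinaryF));
  return (void *)host_copy;
}
```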
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
index 9814e2c6a371b423a725c533c0029425f209fe1d..608950aa473948bc6c3663d88646c8080a5d56e1 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
@@ -488,16 +488,14 @@ void *tensorMap2Half(MathOp f2, void *i1, void *i2) {
     profileEvent("H2F_end");
     profileEvent("#tensorMap2_end");
     return ret;
-  }
-  else if (common_ty == float2_type) {
+  } else if (common_ty == float2_type) {
     Tensor *ret = mapGeneral<half2, 2>(f2, {src1, src2});
     profileEvent("H2F_start");
     convertToFloat2Offline(ret);
     profileEvent("H2F_end");
     profileEvent("#tensorMap2_end");
     return ret;
-  }
-  else {
+  } else {
     ERROR("Type not recognized\n");
     return nullptr; // For some compilers
   }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
index bdce67ddedc24e7192b98bd3beff6aaec942a2d7..38ba3d4683cb60483d4ec5d56f8c21f8fd50a7fa 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
@@ -105,6 +105,7 @@ static Tensor *to_nchw(Tensor *t) {
 
 namespace fs = std::experimental::filesystem;
 
+// List all files in a folder.
 static inline std::vector<std::string> listFiles(const std::string &folder) {
   std::vector<std::string> ret;
   for (const auto &entry : fs::directory_iterator(folder))
@@ -113,6 +114,7 @@ static inline std::vector<std::string> listFiles(const std::string &folder) {
   return ret;
 }
 
+// Returns in[start : start + count] (in Python slicing notation).
 template <typename T>
 std::vector<T>
 sliceVector(const std::vector<T> &in, size_t start, size_t count) {
@@ -125,6 +127,7 @@ sliceVector(const std::vector<T> &in, size_t start, size_t count) {
   return std::vector<T>(slice_begin, slice_end);
 }
 
+// Reads an image dataset from a folder, with each image stored as one file.
 Tensor *
 readDataSet(const char *path, size_t start, size_t count, size_t n_color) {
   INFO("Loading image dataset from path %s\n", path);
@@ -166,6 +169,7 @@ readDataSet(const char *path, size_t start, size_t count, size_t n_color) {
   return nchw_batch;
 }
 
+// Converts a complex-valued image to a float-valued image.
 static Tensor *complexToFloat(Tensor *batch) {
   convertAndGetGPUData<float2>(batch); // Convert to float2
   deviceToHostCopy(batch);
@@ -187,6 +191,7 @@ static Tensor *complexToFloat(Tensor *batch) {
   return ret;
 }
 
+// Save an image tensor image-by-image to a folder.
 void saveDataSet(
     const char *path, Tensor *batch, size_t start_idx, size_t write_n) {
   INFO("Saving image dataset to path %s\n", path);
@@ -228,6 +233,7 @@ void saveDataSet(
   }
 }
 
+// Loads a single image file into a tensor.
 void *loadAsImage(const char *filename, size_t n_color) {
   INFO("Loading image from path=%s\n", filename);
   int x, y, n; // x = width, y = height, n = # 8-bit components per pixel
@@ -244,6 +250,7 @@ void *loadAsImage(const char *filename, size_t n_color) {
   return nchw_image;
 }
 
+// Saves a single tensor as an image file.
 void saveToImage(const char *filename, Tensor *tensor) {
   INFO("Saving image data to path=%s\n", filename);
   deviceToHostCopy(tensor);
@@ -260,6 +267,7 @@ void saveToImage(const char *filename, Tensor *tensor) {
   delete[] ldr;
 }
 
+// Make a conv2d filter from 2-dim data.
 void *createFilterFromData(
     int data_type, void *data, size_t w, size_t h, size_t n_chan) {
   DEBUG("Creating filter from data\n");
@@ -278,6 +286,7 @@ void *createFilterFromData(
   return tensor;
 }
 
+// Normalize an image tensor.
 static void *normalize(void *image) {
   auto *max_1D = tensorReduce(image, 2, MathOp::Max);
   auto *max = tensorReduce(max_1D, 3, MathOp::Max);