diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h
index 947556489de2fb94ef7d793dd46eb74db2240516..83781b148c4bb41619bbbb54d9e69cc9fc7f2543 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/device_math.h
@@ -1,10 +1,15 @@
+/*
+device_math.h
+Provides pointers to CUDA math functions and other properties of a math
+operator (one among MathOp) on a certain scalar type.
+*/
 #ifndef DEVICE_MATH_H
 #define DEVICE_MATH_H
 
-#include <device_launch_parameters.h>
 #include <cuda_fp16.h>
-#include <stdexcept>
+#include <device_launch_parameters.h>
 #include <limits>
+#include <stdexcept>
 
 #include "debug.h"
 
@@ -24,12 +29,18 @@ enum class MathOp {
   PSNR
 };
 
+// Find the CUDA function for math operator `op`.
+// This is ONLY defined (through template specialization, in device_math.cu)
+// for float and half (see below).
 template <typename T> void *mathOpToFunc(MathOp op);
 
 template <> void *mathOpToFunc<float>(MathOp op);
 
 template <> void *mathOpToFunc<half>(MathOp op);
 
+// Returns the identity element of math operator `op`, for example, -inf for
+// MAX, 0 for ADD.
+// A specialization exists for the half type.
 template <typename T> T reduceOpToIdentity(MathOp op) {
   switch (op) {
   case MathOp::Hypot:
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h
index bc85495e209eb8dcd53d8d5b1cc52bc3c4ddcf4e..71099a89e4ff1c47a14c4652556838e55c3850ea 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/broadcast.h
@@ -1,11 +1,20 @@
+/*
+broadcast.h
+Calculates the shape of two tensors broadcast together, using a numpy-like
+(but weaker) rule.
+*/
+
+#ifndef FUNCTIONAL_BROADCAST_H
+#define FUNCTIONAL_BROADCAST_H
+
 #include <algorithm>
 #include <array>
 #include <cstddef>
 #include <type_traits>
 
 #include "common.h"
-#include "tensor.h"
 #include "debug.h"
+#include "tensor.h"
 
 // TODO: don't accept N == 1
 template <size_t N, typename std::enable_if<N >= 1, int>::type = 0>
@@ -72,3 +81,5 @@ private:
   std::vector<size_t> out_sizes, sizes[N];
   size_t tail_stride[N];
 };
+
+#endif // FUNCTIONAL_BROADCAST_H
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
index 76a26d249c2ef1dc9db196f80cd99c1054e12603..00326bef03b78d905f5923ae3ab5a79f327c2e7b 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/common.h
@@ -1,5 +1,13 @@
-#ifndef IMAGE_PROCESSING_COMMON_H
-#define IMAGE_PROCESSING_COMMON_H
+/*
+common.h
+Helper functions shared among functional/* header files and their corresponding
+*.cu files.
+These include utility functions for CUDA, functions on __device__, and Tensor
+methods that really should be in `struct Tensor`.
+*/
+
+#ifndef FUNCTIONAL_COMMON_H
+#define FUNCTIONAL_COMMON_H
 
 #include <cuda_fp16.h>
 #include <cudnn.h>
@@ -8,20 +16,15 @@
 #include <vector>
 
 #include "debug.h"
-#include "tensor.h"
 #include "profiling.h"
+#include "tensor.h"
 
+// Return ceil(a / b) for both host and device.
 template <typename T> __host__ __device__ __forceinline__ T ceilDiv(T a, T b) {
   return (a + b - 1) / b;
 }
 
-template <typename T> __host__ T resolve_func_ptr(void *func_symbol_ptr) {
-  void *v_func_ptr = nullptr;
-  checkCudaErrors(cudaMemcpyFromSymbol(
-      &v_func_ptr, *(void **)func_symbol_ptr, sizeof(void *)));
-  return (T)v_func_ptr;
-}
-
+// Profiles float -> half conversion; can be used like a guard.
 template <typename T> class HFProfileGuard {
   static const char *getEventName(bool end) {
     if (typeid(T) == typeid(half) || typeid(T) == typeid(half2))
@@ -31,6 +34,10 @@ template <typename T> class HFProfileGuard {
   }
 
   static bool needProfiling() {
+    // Only profile when the given type T is half / half2.
+    // As this guard is often used in templated, scalar-type-agnostic
+    // implementations of an operator, this `T` is often that operator's
+    // scalar type.
     return typeid(T) == typeid(half) || typeid(T) == typeid(half2);
   }
 
@@ -46,6 +53,8 @@ public:
   }
 };
 
+// Convert a C++ type (given by template type T) to a "tensor datatype", which
+// is an enum that `struct Tensor` recognizes.
 template <typename T> int getTensorType() {
   if (typeid(T) == typeid(float))
     return (int)float_type;
@@ -61,6 +70,11 @@ template <typename T> int getTensorType() {
   }
 }
 
+// Type-cast Tensor `t` to type `T` (regardless of what current type `t` has),
+// and return a pointer to its underlying data on GPU (which can be t->gpu_data
+// or t->gpu_half_data).
+// This is specialized and implemented for float, float2 (float-complex), half,
+// half2 (used for speeding up operations in half type).
 template <typename T> T *convertAndGetGPUData(Tensor *t);
 
 template <> float *convertAndGetGPUData<float>(Tensor *t);
 
@@ -71,18 +85,35 @@ template <> half *convertAndGetGPUData<half>(Tensor *t);
 
 template <> half2 *convertAndGetGPUData<half2>(Tensor *t);
 
+// Like convertAndGetGPUData, but calls `convertToFP32_offline` instead of
+// `convertToFP32`, which makes a difference when online / offline profiling is
+// involved.
 void convertToFloat2Offline(Tensor *t);
 
+// Return the sizes of a tensor as a vector.
 std::vector<size_t> sizes(Tensor *t);
 
 std::vector<size_t> sizes(const Dimension &dim);
 
+// Return the total number of elements in a tensor.
 size_t num_elems(const std::vector<size_t> &dim_sizes);
 
 size_t num_elems(const Dimension &dim);
 
 size_t num_elems(Tensor *t);
 
+// Checks equivalence of types t1 and t2 under the assumption that float=half
+// and float2=half2, and returns the equalized type.
+// 1. Define an equivalence operator (==):
+//      t == t          = True
+//      float == half   = True
+//      float2 == half2 = True
+//      otherwise       = False
+//    and throw if t1 != t2.
+// 2. Return the same type `t`. But as float is not _actually_ the same thing
+//    as half, `get_half` determines which one to return. E.g. with t1 ==
+//    float2, t2 == half, if get_half == true, half2 is returned, otherwise
+//    float2 is returned.
 Tensor_type_t getCompatibleType(int t1, int t2, bool get_half);
 
-#endif // IMAGE_PROCESSING_COMMON_H
+#endif // FUNCTIONAL_COMMON_H
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
index 525afa9529fc25f9b3d7c3082980a5582771ef67..74568d8183a7a64f48750b4d02a6286224cac817 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map.cuh
@@ -1,5 +1,10 @@
-#ifndef RUNTIME_MAP_H
-#define RUNTIME_MAP_H
+/*
+map.cuh
+Implementation of the map operator, including broadcast, the cuda kernel for
+map, and a general map_n function in host code (which calls the kernel).
+*/
+#ifndef FUNCTIONAL_MAP_H
+#define FUNCTIONAL_MAP_H
 
 #include <array>
 #include <cstddef>
@@ -13,17 +18,20 @@
 #include "tensor.h"
 #include "tensor_utils.h"
 
+// Checks the dimension and data order of each map argument.
 template <size_t N> void mapPrecheck(const std::array<Tensor *, N> &srcs) {
   for (Tensor *src : srcs) {
     if (src->dims.num_dims != 4 || src->data_format != CUDNN_TENSOR_NCHW)
-      ERROR("Not supported\n"); // TODO: support this
+      ERROR("Not supported\n");
   }
 }
 
+// CUDA kernel for map_n. This is _actually_ mostly unused, as specializations
+// for float / half exist for performance benefit.
 template <typename Scalar, size_t N>
 __global__ void kernelMapBroadcast(
-    Scalar *target, unsigned num_rows, void *func,
-    Scalar **srcs, size_t *tail_strides) {
+    Scalar *target, unsigned num_rows, void *func, Scalar **srcs,
+    size_t *tail_strides) {
   auto *n_ary_op = (NTo1MapF<Scalar, N>)func;
 
   unsigned threadId = blockIdx.x * blockDim.x + threadIdx.x,
@@ -39,19 +47,23 @@ __global__ void kernelMapBroadcast(
 // Instantiate float to compare fairly to half. Implemented for N = 1...3
 template <size_t N>
 __global__ void kernelMapBroadcast<float, N>(
-    half *target, unsigned num_rows, void *func,
-    half **srcs, size_t *tail_strides);
+    half *target, unsigned num_rows, void *func, half **srcs,
+    size_t *tail_strides);
 
 // Half uses a different implementation. Implemented for N = 1, 2
 template <size_t N>
 __global__ void kernelMapBroadcast<half, N>(
-    half *target, unsigned num_rows, void *func,
-    half **srcs, size_t *tail_strides);
+    half *target, unsigned num_rows, void *func, half **srcs,
+    size_t *tail_strides);
 
+// Create parameters for the cuda kernel by copying pointers to device (gpu).
+// This function unwraps BroadcastRemap into a cuda array of size N -- one
+// value for the broadcast stride of each map argument, and unwraps `srcs`
+// into their gpu data pointers.
 template <typename Scalar, size_t N>
 std::tuple<size_t *, Scalar **> make_cuda_params(
     const BroadcastRemap<N> &br, const std::array<Tensor *, N> &srcs) {
-  for (Tensor *t: srcs)
+  for (Tensor *t : srcs)
     hostToDeviceCopy(t);
   std::array<Scalar *, N> gpu_datas;
   {
@@ -73,6 +85,8 @@ std::tuple<size_t *, Scalar **> make_cuda_params(
   return std::make_tuple(cuda_strides, cuda_gpu_data);
 }
 
+// Host code for map_n that checks and converts the parameters, and calls the
+// cuda kernel.
 template <
     typename Scalar, size_t N, typename std::enable_if<N >= 1, int>::type = 0>
 __host__ Tensor *mapGeneral(MathOp mop, const std::array<Tensor *, N> &srcs) {
@@ -95,8 +109,8 @@ __host__ Tensor *mapGeneral(MathOp mop, const std::array<Tensor *, N> &srcs) {
   unsigned threads = std::min(max_threads, n_elem);
   unsigned grids = std::min(max_grid, ceilDiv(n_elem, threads));
   kernelMapBroadcast<Scalar, N><<<grids, threads>>>(
-      convertAndGetGPUData<Scalar>(target), n_elem, func_ptr,
-      gpu_data, cuda_strides);
+      convertAndGetGPUData<Scalar>(target), n_elem, func_ptr, gpu_data,
+      cuda_strides);
   cudaDeviceSynchronize();
   checkCUDA(cudaGetLastError());
   return target;
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h
index 4cb93661384a572425361a053371c409606de164..c6c804fa00f1ae5eb324d6928d8f3c43b1231d14 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/map_typing.h
@@ -1,7 +1,11 @@
-#ifndef IMAGE_PROCESSING_MAP_TYPING_H
-#define IMAGE_PROCESSING_MAP_TYPING_H
+/*
+map_typing.h
+Helpers for metaprogramming used by map.cuh.
+Defines some recursively templated types and functions.
+*/
+#ifndef FUNCTIONAL_MAP_TYPING_H
+#define FUNCTIONAL_MAP_TYPING_H
 
-// Constructs type T (*)(T, T, T, T, ... <n_times>) from T and N
 #include <cstddef>
 #include <device_launch_parameters.h>
 #include <tuple>
@@ -17,9 +21,13 @@ struct _RepNType<T, W, std::index_sequence<Is...>> {
   using type = W<Type<T, Is>...>;
 };
 
+// Constructs type W<T, T, T, T, ... (N times)> from T and N
+// RepNType T W N = W (T, T, T ... N times ..., T)
 template <typename T, template <typename...> typename W, size_t N>
 using RepNType = typename _RepNType<T, W, std::make_index_sequence<N>>::type;
 
+// Like std::function<Ret(Args...)>, but denotes a raw function pointer instead
+// of a lambda function.
 template <typename Ret, typename... Args> using FuncPtrT = Ret (*)(Args...);
 
 template <typename Ret, typename Arg, size_t N> struct _NAToBFunc {
@@ -29,11 +37,15 @@ template <typename Ret, typename Arg, size_t N> struct _NAToBFunc {
 };
 } // namespace
 
+// NAToBF Ret Arg N = Ret(*)(Arg, Arg, ...N times)
 template <typename Ret, typename Arg, size_t N>
 using NAToBF = typename _NAToBFunc<Ret, Arg, N>::type;
 
+// NTo1MapF Arg N = Arg(*)(Arg, Arg, ...N times)
+// This denotes an n-to-1 map: Arg x Arg x Arg x ... -> Arg.
 template <typename Scalar, size_t N> using NTo1MapF = NAToBF<Scalar, Scalar, N>;
 
+// RepNTuple T N = std::tuple<T, T, ...N times>
 template <typename T, size_t N> using RepNTuple = RepNType<T, std::tuple, N>;
 
 namespace {
@@ -49,20 +61,25 @@ __device__ auto call(Function f, Tuple t, std::index_sequence<I...>) {
 }
 } // namespace
 
+// Converts an Iterable of type T and length N to a (same-typed) tuple
+// std::tuple<T, T, T, T, ...>
 template <typename TIterable, typename T, size_t N>
 constexpr RepNTuple<T, N> as_tuple(TIterable arr) {
   return as_tuple<TIterable, T>(arr, std::make_index_sequence<N>{});
 }
 
+// Expands Tuple t into the parameters of Function f; in Python this would be
+// f(*t).
 template <typename Function, typename Tuple>
 __device__ auto call_on_tuple(Function f, Tuple t) {
   static constexpr auto size = std::tuple_size<Tuple>::value;
   return call(f, t, std::make_index_sequence<size>{});
 }
 
+// Expands an Array of type T and size N into the parameters of Function.
 template <typename Ret, typename T, size_t N>
 __device__ Ret call_on_c_array(NAToBF<Ret, T, N> f, const T arr[N]) {
   return call_on_tuple(f, as_tuple<const T *, T, N>(arr));
 }
 
-#endif // IMAGE_PROCESSING_MAP_TYPING_H
+#endif // FUNCTIONAL_MAP_TYPING_H
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
index de5402942cf7be965a342ebfedf80d21475c49d3..9f4fabfb5e0b75017e901c2cb4c60d8649b04f07 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/functional/reduce.cuh
@@ -1,3 +1,7 @@
+/*
+reduce.cuh
+Implementation of the reduce operator.
+*/
 #include <device_launch_parameters.h>
 #include <functional>
 #include <numeric>
@@ -76,7 +80,8 @@ __global__ void kernelReduceDimSeq(
     for (size_t irow = start_irow; irow < num_irows; irow += irow_stride) {
       K *src = src_ + orow * row_size * num_irows + irow;
       K *target = target_ + orow * num_irows + irow;
-      reduceAlongDim(target, src, init, identity, func, num_irows, approx_row_size);
+      reduceAlongDim(
+          target, src, init, identity, func, num_irows, approx_row_size);
     }
   }
 }
@@ -101,16 +106,17 @@ __global__ void __launch_bounds__(NThreads) kernelReduceDimParallel(
       K *src = src_ + orow * row_size * num_irows + irow;
       K *target = target_ + orow * num_irows + irow;
       parallelReduceAlongDim(
-          target, src, init, identity, func, num_irows, approx_row_size, threadIdx.x,
-          blockDim.x);
+          target, src, init, identity, func, num_irows, approx_row_size,
+          threadIdx.x, blockDim.x);
     }
   }
 }
 
+/* Entry point for the `reduce` implementation. Calls the right version of the
+ * reduction kernel as needed. */
 template <typename Scalar>
 __host__ Tensor *reduceDim(
-    Tensor *src, const Scalar &init, MathOp op, size_t axis,
-    float skip_rate) {
+    Tensor *src, const Scalar &init, MathOp op, size_t axis, float skip_rate) {
   // Copy input over
   hostToDeviceCopy(src);
 
@@ -156,8 +162,8 @@ __host__ Tensor *reduceDim(
     dim3 threads(AlongDimTh, CrossDimTh);
     dim3 grid(grid_x, grid_y);
     kernelReduceDimParallel<Scalar><<<grid, threads>>>(
-        convertAndGetGPUData<Scalar>(target), src_data,
-        init, identity, func, num_irows, num_orows, row_size, approx_row_size);
+        convertAndGetGPUData<Scalar>(target), src_data, init, identity, func,
+        num_irows, num_orows, row_size, approx_row_size);
   } else {
     DEBUG(
         "Reducing sequentially, row size = %lu, actually using %lu\n", row_size,
@@ -169,8 +175,8 @@ __host__ Tensor *reduceDim(
         std::min(MaxBlocksPerDim, num_orows), ceilDiv(MaxNBlocks, grid_x));
     dim3 grid(grid_x, grid_y);
     kernelReduceDimSeq<Scalar><<<grid, threads>>>(
-        convertAndGetGPUData<Scalar>(target), src_data,
-        init, identity, func, num_irows, num_orows, row_size, approx_row_size);
+        convertAndGetGPUData<Scalar>(target), src_data, init, identity, func,
+        num_irows, num_orows, row_size, approx_row_size);
   }
   cudaDeviceSynchronize();
   checkCUDA(cudaGetLastError());
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
index 708c52917a3fc2e4eb92da7526350346f85ca459..608107e1dfb39bb268899227dc21f45d969de1f7 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_runtime.h
@@ -8,8 +8,10 @@
 // *** Runtime declaration *** //
 void *tensorFft(void *input, bool inverse);
 void *tensorFftHalf(void *input, bool inverse);
-void *tensorReduce(void *input, size_t axis, MathOp func, float skip_ratio = 0.0f);
-void *tensorReduceHalf(void *input, size_t axis, MathOp func, float skip_ratio = 0.0f);
+void *
+tensorReduce(void *input, size_t axis, MathOp func, float skip_ratio = 0.0f);
+void *tensorReduceHalf(
+    void *input, size_t axis, MathOp func, float skip_ratio = 0.0f);
 void *tensorProjectiveT(void *input, void *transformation);
 void *tensorMap1(MathOp f, void *i);
 void *tensorMap2(MathOp f2, void *i1, void *i2);
@@ -21,8 +23,8 @@ void *tensorMap3Half(MathOp f3, void *i1, void *i2, void *i3);
 // *** Wrapper API declaration *** //
 extern "C" {
 void *wrapper_tensorFft(const char *hpvm_node_id, void *input, bool inverse);
-void *wrapper_tensorReduce(
-    const char *hpvm_node_id, void *input, int axis, int func);
+void *
+wrapper_tensorReduce(const char *hpvm_node_id, void *input, int axis, int func);
 void *wrapper_tensorProjectiveT(
     const char *hpvm_node_id, void *input, void *transformation);
 void *wrapper_tensorMap1(const char *hpvm_node_id, int func, void *input);
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h
index 6ebd8d52715e8509a62f7d6253d28586ca117b80..bf6664b0e87ce7fb68d0a8c0b992ba12e045c4d1 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/include/img_tensor_utils.h
@@ -1,3 +1,7 @@
+/*
+img_tensor_utils.h
+Util functions for image load/save, image quality calculation (PSNR), etc.
+*/
 #ifndef IMG_TENSOR_UTILS
 #define IMG_TENSOR_UTILS
 
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
index 163efd1a2dec403146ad83c1e00ef1cffa48222f..0e05813bb6eb5de86057bf3b2066c8fd98642e8d 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/device_math.cu
@@ -1,3 +1,4 @@
+/* device_math.cu: implementations for device_math.h */
 #include "device_math.h"
 #include "error.h"
 
@@ -18,8 +19,8 @@ namespace _internal {
 
-// The following functions are not used, but they reference their cuda counterpart
-// which is necessary, otherwise ptx compilation breaks
+// The following functions are not used, but they reference their cuda
+// counterpart, which is necessary; otherwise ptx compilation breaks
 __device__ float hypotf_(float x, float y) { return hypotf(x, y); }
 
 __device__ float atan2f_(float x, float y) { return atan2f(x, y); }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
index 9814e2c6a371b423a725c533c0029425f209fe1d..608950aa473948bc6c3663d88646c8080a5d56e1 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_runtime.cu
@@ -488,16 +488,14 @@ void *tensorMap2Half(MathOp f2, void *i1, void *i2) {
     profileEvent("H2F_end");
     profileEvent("#tensorMap2_end");
     return ret;
-  }
-  else if (common_ty == float2_type) {
+  } else if (common_ty == float2_type) {
     Tensor *ret = mapGeneral<half2, 2>(f2, {src1, src2});
     profileEvent("H2F_start");
    convertToFloat2Offline(ret);
     profileEvent("H2F_end");
     profileEvent("#tensorMap2_end");
     return ret;
-  }
-  else {
+  } else {
     ERROR("Type not recognized\n");
     return nullptr; // For some compilers
   }
diff --git a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
index bdce67ddedc24e7192b98bd3beff6aaec942a2d7..38ba3d4683cb60483d4ec5d56f8c21f8fd50a7fa 100644
--- a/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
+++ b/llvm/projects/hpvm-tensor-rt/tensor_runtime/src/img_tensor_utils.cpp
@@ -105,6 +105,7 @@ static Tensor *to_nchw(Tensor *t) {
 
 namespace fs = std::experimental::filesystem;
 
+// List all files in a folder.
 static inline std::vector<std::string> listFiles(const std::string &folder) {
   std::vector<std::string> ret;
   for (const auto &entry : fs::directory_iterator(folder))
@@ -113,6 +114,7 @@ static inline std::vector<std::string> listFiles(const std::string &folder) {
   return ret;
 }
 
+// Return in[start:start+count].
 template <typename T>
 std::vector<T>
 sliceVector(const std::vector<T> &in, size_t start, size_t count) {
@@ -125,6 +127,7 @@ sliceVector(const std::vector<T> &in, size_t start, size_t count) {
   return std::vector<T>(slice_begin, slice_end);
 }
 
+// Read an image dataset from a folder, with each image as a file.
 Tensor *
 readDataSet(const char *path, size_t start, size_t count, size_t n_color) {
   INFO("Loading image dataset from path %s\n", path);
@@ -166,6 +169,7 @@ readDataSet(const char *path, size_t start, size_t count, size_t n_color) {
   return nchw_batch;
 }
 
+// Convert a complex-domain image to a float-valued image.
 static Tensor *complexToFloat(Tensor *batch) {
   convertAndGetGPUData<float2>(batch); // Convert to float2
   deviceToHostCopy(batch);
@@ -187,6 +191,7 @@ static Tensor *complexToFloat(Tensor *batch) {
   return ret;
 }
 
+// Save an image tensor image-by-image to a folder.
 void saveDataSet(
     const char *path, Tensor *batch, size_t start_idx, size_t write_n) {
   INFO("Saving image dataset to path %s\n", path);
@@ -228,6 +233,7 @@ void saveDataSet(
   }
 }
 
+// Load 1 file as an image into a tensor.
 void *loadAsImage(const char *filename, size_t n_color) {
   INFO("Loading image from path=%s\n", filename);
   int x, y, n; // x = width, y = height, n = # 8-bit components per pixel
@@ -244,6 +250,7 @@ void *loadAsImage(const char *filename, size_t n_color) {
   return nchw_image;
 }
 
+// Save 1 tensor as an image into a file.
 void saveToImage(const char *filename, Tensor *tensor) {
   INFO("Saving image data to path=%s\n", filename);
   deviceToHostCopy(tensor);
@@ -260,6 +267,7 @@ void saveToImage(const char *filename, Tensor *tensor) {
   delete[] ldr;
 }
 
+// Make a conv2d filter from 2-dim data.
 void *createFilterFromData(
     int data_type, void *data, size_t w, size_t h, size_t n_chan) {
   DEBUG("Creating filter from data\n");
@@ -278,6 +286,7 @@ void *createFilterFromData(
   return tensor;
 }
 
+// Normalize an image tensor.
 static void *normalize(void *image) {
   auto *max_1D = tensorReduce(image, 2, MathOp::Max);
   auto *max = tensorReduce(max_1D, 3, MathOp::Max);
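
Usage note (not part of the patch): a minimal sketch of how the entry points documented above compose, based only on the declarations in img_tensor_runtime.h and device_math.h shown in this diff. The function gradientMagnitudeMax and the tensors dx and dy are hypothetical names; the include paths are assumed.

// Illustrative sketch only. Assumes `dx` and `dy` are two same-shaped NCHW
// float tensors already created by the tensor runtime.
#include "device_math.h"        // MathOp
#include "img_tensor_runtime.h" // tensorMap2, tensorReduce

void *gradientMagnitudeMax(void *dx, void *dy) {
  // Elementwise binary map: magnitude = hypot(dx, dy).
  void *magnitude = tensorMap2(MathOp::Hypot, dx, dy);
  // Reduce over H (axis 2) and then W (axis 3), with the default
  // skip_ratio = 0.0f (no approximate row skipping), mirroring the
  // normalize() helper in img_tensor_utils.cpp.
  void *max_1d = tensorReduce(magnitude, 2, MathOp::Max);
  return tensorReduce(max_1d, 3, MathOp::Max);
}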