From b40b0a7cf2744e68e9f89e14eca46c98e8cb9a70 Mon Sep 17 00:00:00 2001
From: Chris Maffeo <cmaffeo2@illinois.edu>
Date: Tue, 25 Apr 2023 14:40:52 -0500
Subject: [PATCH] Update

---
 src/CMakeLists.txt             |   1 +
 src/Interaction.h              |   2 +-
 src/PatchOp.cuh                |  95 +++++++++++
 src/PatchOp.h                  |   1 +
 src/Tests/CMakeLists.txt       |   1 +
 src/Tests/matrix3.cu           | 297 +++++++++++++++++++++++++++++++++
 src/Tests/type_name.h          |   2 +-
 src/Tests/vector3_precision.cu | 143 ++++++++--------
 src/Types.h                    |  30 +++-
 src/Types/Matrix3.h            | 234 ++++++++++++++++++++++++++
 src/Types/Vector3.h            |  48 ++++--
 11 files changed, 764 insertions(+), 90 deletions(-)
 create mode 100644 src/PatchOp.cuh
 create mode 100644 src/Tests/matrix3.cu
 create mode 100644 src/Types/Matrix3.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4c3fa1e..fb681eb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,4 +8,5 @@ add_library("lib${PROJECT_NAME}"
   Interaction/CPU.cpp
   Interaction/CUDA.cu
   SignalManager.cpp
+  PatchOp.cu
 )
diff --git a/src/Interaction.h b/src/Interaction.h
index 93a5f27..a088381 100644
--- a/src/Interaction.h
+++ b/src/Interaction.h
@@ -32,7 +32,7 @@ public:
      * @brief Struct for defining interaction configuration
      */
     struct Conf {
-	    enum Object    {Particle, RigidBody };
+	    enum Object    {Particle, RigidBody};
 	    enum DoF {Bond, Angle, Dihedral, Bonded, NeighborhoodPair};
 	    enum Form {Harmonic, Tabulated, LennardJones, Coulomb, LJ};
 	    enum Backend   { Default, CUDA, CPU };
diff --git a/src/PatchOp.cuh b/src/PatchOp.cuh
new file mode 100644
index 0000000..52bf95b
--- /dev/null
+++ b/src/PatchOp.cuh
@@ -0,0 +1,95 @@
+#pragma once
+#include "PatchOp.h"
+
+struct ParticleType {
+    float diffusion = 1;
+};
+struct SimParameters {
+    float timestep = 1;
+    float kT = 1;
+};
+
+struct Random {
+    __host__ __device__ Vector3 gaussian_vector() {
+	return Vector3(0.0f);
+    }
+    int seed = 1;
+};
+
+
+namespace LocalOneParticleLoop {
+
+    namespace BD {
+	template<typename ...Op_t>
+	__global__ void op_kernel(size_t num, Vector3* __restrict__ pos_force, size_t* type_ids, ParticleType* types, SimParameters& sim, Random& random) {
+	    for (size_t i = threadIdx.x + blockIdx.x*blockDim.x; i < num; i+= gridDim.x*blockDim.x) {
+		Vector3& pos = pos_force[i];
+		Vector3& force = pos_force[num+i];
+		ParticleType& t = types[type_ids[i]];
+		
+		using expander = int[];
+		(void)expander{0, (void(Op_t::op(pos, force, t, sim, random)))...};
+	    }
+	}
+    }
+    
+    namespace MD {
+	template<typename ...Op_t>
+	__global__ void op_kernel(size_t num, Vector3* __restrict__ pos_force, size_t* type_ids, ParticleType* types) {
+	    for (size_t i = threadIdx.x + blockIdx.x*blockDim.x; i < num; i+= gridDim.x*blockDim.x) {
+		Vector3& pos = pos_force[i];
+		Vector3& mom = pos_force[num+i];
+		Vector3& force = pos_force[2*num+i];
+		ParticleType& t = types[type_ids[i]];
+	    
+		using expander = int[];
+		(void)expander{0, (void(Op_t::op(&pos, &force, t, &mom)))...};	
+	    }
+	}
+    }
+
+    template<size_t block_size=32, bool is_BD=true, typename ...Op_t>
+    class LocalOneParticleLoop : public BasePatchOp {
+	void compute(Patch* patch) {
+	    size_t num_blocks = (patch->num+1)/block_size;
+	    if (is_BD) {
+		BD::op_kernel<Op_t...><<<block_size, num_blocks>>>(patch->num, patch->pos, patch->type_ids, patch->types);
+	    } else {
+		MD::op_kernel<Op_t...><<<block_size, num_blocks>>>(patch->num, patch->pos, patch->type_ids, patch->types);
+	    }
+	}
+    };
+
+    struct OpIntegrateBD {
+	HOST DEVICE static void op(Vector3& __restrict__ pos,
+				   Vector3& __restrict__ force,
+				   ParticleType& type,
+				   SimParameters& sim,
+				   Random& random,
+				   Vector3* __restrict__ mom = nullptr) {
+
+	    const float Dt = type.diffusion*sim.timestep;
+	    Vector3 R = random.gaussian_vector(); // TODO who owns "random" object; how is it's state stored and recalled (especially if hardware changes); how is the implementation efficient on HOST/DEVICE?
+
+	    const Vector3 new_pos = pos + (Dt / sim.kT) * force + sqrtf(2.0f * Dt) * R;
+	    // pos = sim.wrap(new_pos); // TODO decide how wrapping will be handled
+	}
+    };
+
+    struct OpComputeForceBD {
+	HOST DEVICE static void op(Vector3& __restrict__ pos,
+				   Vector3& __restrict__ force,
+				   ParticleType& type,
+				   SimParameters& sim,
+				   Random& random,
+				   Vector3* __restrict__ mom = nullptr) {
+
+	    const float Dt = type.diffusion*sim.timestep;
+	    Vector3 R = random.gaussian_vector(); // TODO who owns "random" object; how is it's state stored and recalled (especially if hardware changes); how is the implementation efficient on HOST/DEVICE?
+
+	    const Vector3 new_pos = pos + (Dt / sim.kT) * force + sqrtf(2.0f * Dt) * R;
+	    // pos = sim.wrap(new_pos); // TODO decide how wrapping will be handled
+	}
+    };
+
+}
diff --git a/src/PatchOp.h b/src/PatchOp.h
index ca4ce36..f660b76 100644
--- a/src/PatchOp.h
+++ b/src/PatchOp.h
@@ -44,5 +44,6 @@ private:
     void* compute_data;
 };
 
+
 #include "Integrator.h"
 #include "Interaction.h"
diff --git a/src/Tests/CMakeLists.txt b/src/Tests/CMakeLists.txt
index f85436c..15fb2f2 100644
--- a/src/Tests/CMakeLists.txt
+++ b/src/Tests/CMakeLists.txt
@@ -6,6 +6,7 @@ include(CTest)
 
 ## TODO, add a macro to add these projects, build with CUDA
 add_executable("arbd_tests"
+matrix3.cu
 vector3_precision.cu
 )
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
diff --git a/src/Tests/matrix3.cu b/src/Tests/matrix3.cu
new file mode 100644
index 0000000..4b54306
--- /dev/null
+++ b/src/Tests/matrix3.cu
@@ -0,0 +1,297 @@
+#include <iostream>
+#include <cstdio>
+
+#include "../SignalManager.h"
+#include "../Types.h"
+#include <cuda.h>
+#include <nvfunctional>
+
+#include "type_name.h"
+
+#include <catch2/catch_tostring.hpp>
+namespace Catch {
+    template<typename T, bool b1, bool b2>
+    struct StringMaker<Matrix3_t<T,b1,b2>> {
+        static std::string convert( Matrix3_t<T,b1,b2> const& value ) {
+            return value.to_string();
+        }
+    };
+}
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+
+
+
+namespace Tests {
+
+    template<typename Op_t, typename R, typename ...T>
+    __global__ void op_kernel(R* result, T...in) {
+	if (blockIdx.x == 0) {
+	    *result = Op_t::op(in...);
+	}
+    }
+
+
+    // In C++14, how can I unpack a tuple so its elements are passed to a function as arguments?
+    template <typename F, typename Tuple, size_t... I>
+    decltype(auto) apply_impl(F&& f, Tuple&& t, std::index_sequence<I...>)
+    {
+	return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...);
+    }
+    template <typename F, typename Tuple>
+    decltype(auto) apply(F&& f, Tuple&& t)
+    {
+	using Indices = std::make_index_sequence<std::tuple_size<std::decay_t<Tuple>>::value>;
+	return apply_impl(std::forward<F>(f), std::forward<Tuple>(t), Indices{});
+    }
+
+    template <typename F, typename Tuple, size_t... I>
+    decltype(auto) apply_each_impl(F&& f, Tuple&& t, std::index_sequence<I...>)
+    {
+	return std::forward<F>(f)(std::get<I>(std::forward<Tuple>(t))...);
+    }
+    template <typename F, typename Tuple>
+    decltype(auto) apply_each(F&& f, Tuple&& t)
+    {
+	using Indices = std::make_index_sequence<std::tuple_size<std::decay_t<Tuple>>::value>;
+	return apply_each_impl(std::forward<F>(f), std::forward<Tuple>(t), Indices{});
+    }
+
+    template<typename T>
+    void call_info(T t) {
+	// INFO( type_name(t) );
+	// UNSCOPED_INFO( type_name<T>() << " " << t );
+    }
+    // template<>
+    // void call_info(double t) {
+    // 	UNSCOPED_INFO( "double " << t );
+    // }
+    // template<>
+    // void call_info(float t) {
+    // 	UNSCOPED_INFO( "double " << t );
+    // 	// INFO( type_name(t) );
+    // 	INFO( "float" );
+    // }
+    // template<>
+    // void call_info(int t) {
+    // 	// INFO( type_name(t) );
+    // 	INFO( "int" );
+    // }
+    
+    template<typename Op_t, typename R, typename ...T>
+    void run_trial( std::string name, R expected_result, T...args) {
+	R *gpu_result_d, gpu_result, cpu_result;
+	cpu_result = Op_t::op(args...);
+	cudaMalloc((void **)&gpu_result_d, sizeof(R));
+	
+	op_kernel<Op_t, R, T...><<<1,1>>>(gpu_result_d, args...);
+	cudaMemcpy(&gpu_result, gpu_result_d, sizeof(R), cudaMemcpyDeviceToHost);
+	cudaDeviceSynchronize();
+	
+	INFO( name );
+	// INFO( type_name<T...>() );
+	
+	// auto tmp = std::make_tuple(args...);
+	// for (int i = 0; i < sizeof(T...); ++i) {
+	//     auto& arg = std::get<i>(tmp);
+	//     CAPTURE( arg );
+	// }
+	// auto fn = [](auto a) { CAPTURE( a ); };
+
+	// auto fn = [](auto a) { INFO( a->to_string() ); };
+	// using expander = int[];
+	// (void)expander{0, (void(call_info<T>(std::forward<T>(args))), 0)...};
+
+	CAPTURE( cpu_result );
+	CAPTURE( expected_result );
+	
+	REQUIRE( cpu_result == expected_result );
+	CHECK( cpu_result == gpu_result );
+    }
+}
+
+namespace Tests::Unary {
+    template<typename R, typename T>
+    struct NegateOp { HOST DEVICE static R op(T in) { return static_cast<R>(-in); } };
+
+    template<typename R, typename T>
+    struct NormalizedOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized()); } };
+
+    namespace Matrix3 {
+	template<typename R, typename T>
+	struct DeterminantOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.det()); } };
+
+	template<typename R, typename T>
+	struct NormalizeDetOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized().det()); } };
+
+	// template<typename R, typename T>
+	// struct NormalizeOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.normalized()); } };
+
+
+	
+	template<typename R, typename T>
+	struct InverseOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.inverse()); } };
+
+	template<typename R, typename T>
+	struct TransposeOp { HOST DEVICE static R op(T in) { return static_cast<R>(in.transpose()); } };
+    }
+}
+
+namespace Tests::Binary {
+    // R is return type, T and U are types of operands
+    template<typename R, typename T, typename U> 
+    struct AddOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a+b); } };
+    template<typename R, typename T, typename U> 
+    struct SubOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a-b); } };
+    template<typename R, typename T, typename U> 
+    struct MultOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a*b); } };
+    template<typename R, typename T, typename U> 
+    struct DivOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a/b); } };
+
+    namespace Matrix3 {
+	template<typename R, typename T, typename U> 
+	struct CrossOp { HOST DEVICE static R op(T a, U b) { return static_cast<R>(a.cross(b)); } };
+    }
+}
+
+namespace Tests::Unary::Matrix3 {
+
+    template<typename A, bool is_diag=false, bool check_diag=false>
+    void run_tests() {
+	using T = Matrix3_t<A, is_diag, check_diag>;
+	// std::ostream test_info;
+	INFO( "Testing " << type_name<T>() << " unary operators that" );
+	{
+	    using R = A;
+	    INFO(  "    return " << type_name<R>() );
+	    run_trial<DeterminantOp<R,T>,R,T>( "Testing determinant", R(6), T(1,2,3) );
+	    run_trial<NormalizeDetOp<R,T>,R,T>( "Testing that normalized matrix has determinant == 1", R(1), T(1,1,1) );
+	    // run_trial<NormalizeDetOp<R,T>,R,T>( "Testing that normalized matrix has determinant == 1", R(1), T(2,2,2) );
+	    // run_trial<NormalizeDetOp<R,T>,R,T>( "Testing that normalized matrix has determinant == 1", R(1), T(1,2,3) );
+	}
+
+	{ // Test operators that return Matrix
+	    using R = T;
+	    INFO( "    return " << type_name<R>() );
+	    run_trial<TransposeOp<R,T>,R,T>( "Testing transpose",
+					     R(1,4,7,
+					       2,5,8,
+					       3,6,9),
+					     T(1,2,3,
+					       4,5,6,
+					       7,8,9) );
+	    run_trial<NegateOp<R,T>,R,T>( "Testing negation", R(-1,-2,-3), T(1,2,3) );
+	    run_trial<InverseOp<R,T>,R,T>( "Testing inversion",
+					   R(3.0/16, 0.25, -5.0/16,
+					     0.25, 0, 0.25, -5.0/16,
+					     0.25, 3.0/16),
+					   T(1,2,-1,
+					     2,1,2,
+					     -1,2,1) );
+	    run_trial<NormalizedOp<R,T>,R,T>( "Testing Matrix3_t<>.normalized()", R(1,1,1), T(1,1,1) );
+	    run_trial<NormalizedOp<R,T>,R,T>( "Testing Matrix3_t<>.normalized()", R(1,1,1), T(2,2,2) );
+	    run_trial<NormalizedOp<R,T>,R,T>( "Testing Matrix3_t<>.normalized()", R(A(1.0/6),A(2.0/6),A(3.0/6)), T(1,2,3) );
+	}
+    }
+    TEST_CASE( "Check that Matrix3_t unary operations are identical on GPU and CPU", "[Tests::Unary::Matrix3]" ) {
+	// INFO("Test case start");
+	const bool is_diag = false;
+	const bool check_diag = true;
+	run_tests<double, is_diag, check_diag>();
+	run_tests<float, is_diag, check_diag>();
+	run_tests<int, is_diag, check_diag>();
+    }
+}
+
+namespace Tests::Binary::Matrix3 {
+
+    template<typename A, typename B, bool is_diag=false, bool check_diag=false>
+    void run_tests() {
+	using T = Matrix3_t<A, is_diag, check_diag>;
+
+	{ // Test binary operators that return Matrix where U b is a scalar
+	    using U = B;
+	    using R = T;
+	    run_trial<MultOp<R,T,U>,R,T,U>( "Scale", R(2,4,6), T(1,2,3), U(2) );
+	}
+
+	{ // Test binary operators that return Vector where U b is Vector
+	    using U = Vector3_t<B>;
+	    using R = U;
+	    run_trial<MultOp<R,T,U>,R,T,U>( "Matrix.transform(Vector)",
+					    R(1+4+9,
+					      4+10+18,
+					      7+16+27),
+					    T(1,2,3,
+					      4,5,6,
+					      7,8,9),
+					    U(1,2,3) );
+	}
+
+	{ // Test binary operators that return Vector and U b is Vector
+	    using U = Vector3_t<B>;
+	    using R = U;
+	    run_trial<MultOp<R,T,U>,R,T,U>( "Matrix element multiplication",
+					    R(1+4+9,
+					      4+10+18,
+					      7+16+27),
+					    T(1,2,3,
+					      4,5,6,
+					      7,8,9),
+					    U(1,2,3) );
+	}
+
+	{ // Test binary operators that return Matrix and U b is Matrix
+	    using U = Matrix3_t<B,is_diag,check_diag>;
+	    using R = std::common_type_t<T,U>;
+	    run_trial<AddOp<R,T,U>,R,T,U>( "Matrix addition",
+					    R(2,4,6,
+					      8,10,12,
+					      14,16,18),
+					    T(1,2,3,
+					      4,5,6,
+					      7,8,9),
+					    T(1,2,3,
+					      4,5,6,
+					      7,8,9));
+
+	    run_trial<SubOp<R,T,U>,R,T,U>( "Matrix subtraction",
+					    R(0,0,0),
+					    T(1,2,3,
+					      4,5,6,
+					      7,8,9),
+					    T(1,2,3,
+					      4,5,6,
+					      7,8,9) );
+
+	    run_trial<MultOp<R,T,U>,R,T,U>( "Matrix transformation",
+					    R(1+8+21, 2+10+24, 3+12+27,
+					      4+20+42, 8+25+48, 12+30+54,
+					      7+32+63, 14+40+72, 21+48+81),
+					    T(1,2,3,
+					      4,5,6,
+					      7,8,9),
+					    T(1,2,3,
+					      4,5,6,
+					      7,8,9) );
+	}	
+    }
+    TEST_CASE( "Check that Matrix3_t binary operations are identical on GPU and CPU", "[Tests::Binary::Matrix3]" ) {
+	// INFO("Test case start");
+	const bool is_diag = false;
+	const bool check_diag = true;
+	run_tests<double,double, is_diag, check_diag>();
+	// run_tests<float,double, is_diag, check_diag>();
+	//run_tests<int,double, is_diag, check_diag>();
+
+	//run_tests<double,float, is_diag, check_diag>();
+	run_tests<float,float, is_diag, check_diag>();
+	// run_tests<int,float, is_diag, check_diag>();
+
+	//run_tests<double,int, is_diag, check_diag>();
+	//run_tests<float,int, is_diag, check_diag>();
+
+	// run_tests<int,int, is_diag, check_diag>();
+
+    }
+}
diff --git a/src/Tests/type_name.h b/src/Tests/type_name.h
index 90e8309..f64e440 100644
--- a/src/Tests/type_name.h
+++ b/src/Tests/type_name.h
@@ -7,7 +7,7 @@
 #include <string>
 #include <cstdlib>
 
-template <typename T>
+template <typename T, typename ...Extras>
 std::string type_name() {
     using TR = typename std::remove_reference<T>::type;
     std::unique_ptr<char, void(*)(void*)> own
diff --git a/src/Tests/vector3_precision.cu b/src/Tests/vector3_precision.cu
index b02c3b4..a8b93db 100644
--- a/src/Tests/vector3_precision.cu
+++ b/src/Tests/vector3_precision.cu
@@ -13,92 +13,93 @@
 
 #include "type_name.h"
 
+namespace Tests::Vector3 {
+    enum BinaryOp_t { ADD, CROSS, DOT, SUB, FINAL };
+    BinaryOp_t& operator++(BinaryOp_t& op) { return op = static_cast<BinaryOp_t>( 1+static_cast<int>(op) ); }
 
-enum BinaryOp_t { ADD, CROSS, DOT, SUB, FINAL };
-BinaryOp_t& operator++(BinaryOp_t& op) { return op = static_cast<BinaryOp_t>( 1+static_cast<int>(op) ); }
-
-std::string get_binary_op_name( BinaryOp_t op ) {
-    switch (op) {
-    case ADD:
-	return "add";
-    case SUB:
-	return "subtract";
-    case CROSS:
-	return "cross";
-    case DOT:
-	return "dot";
+    std::string get_binary_op_name( BinaryOp_t op ) {
+	switch (op) {
+	case ADD:
+	    return "add";
+	case SUB:
+	    return "subtract";
+	case CROSS:
+	    return "cross";
+	case DOT:
+	    return "dot";
+	}
+	return std::string(""); // (static_cast<int>(op)));
     }
-    return std::string(""); // (static_cast<int>(op)));
-}
 
-template<typename R, typename T, typename U>
-__host__ __device__ nvstd::function<R(T,U)> get_binary_op_func( BinaryOp_t op) {
-    switch (op) {
-    case ADD:
+    template<typename R, typename T, typename U>
+    __host__ __device__ nvstd::function<R(T,U)> get_binary_op_func( BinaryOp_t op) {
+	switch (op) {
+	case ADD:
+	    return [] (T a, U b) {return static_cast<R>(b+a);};
+	case SUB:
+	    return [] (T a, U b) {return static_cast<R>(b-a);};
+	case CROSS:
+	    return [] (T a, U b) {return static_cast<R>(b.cross(a));};
+	case DOT:
+	    return [] (T a, U b) {return static_cast<R>(b.dot(a));};
+	default:
+	    assert(false);
+	}
 	return [] (T a, U b) {return static_cast<R>(b+a);};
-    case SUB:
-	return [] (T a, U b) {return static_cast<R>(b-a);};
-    case CROSS:
-	return [] (T a, U b) {return static_cast<R>(b.cross(a));};
-    case DOT:
-	return [] (T a, U b) {return static_cast<R>(b.dot(a));};
-    default:
-	assert(false);
     }
-    return [] (T a, U b) {return static_cast<R>(b+a);};
-}
 
-template<typename R, typename T, typename U>
-__global__ void binary_op_test_kernel( BinaryOp_t op, R* result, T in1, U in2 ) {
-    nvstd::function<R(T,U)> fn = get_binary_op_func<R,T,U>(op);
-    if (blockIdx.x == 0) {
-	*result = fn(in1,in2);
+    template<typename R, typename T, typename U>
+    __global__ void binary_op_test_kernel( BinaryOp_t op, R* result, T in1, U in2 ) {
+	nvstd::function<R(T,U)> fn = get_binary_op_func<R,T,U>(op);
+	if (blockIdx.x == 0) {
+	    *result = fn(in1,in2);
+	}
     }
-}
 
-template<typename T, typename U>
-void check_vectors_equal( T&& cpu, U&& gpu) {
-    CHECK( type_name<decltype(cpu)>() == type_name<decltype(gpu)>() ); // should be unneccesary
-    CHECK( cpu.x == gpu.x );
-    CHECK( cpu.y == gpu.y );
-    CHECK( cpu.z == gpu.z );
-    CHECK( cpu.w == gpu.w );
-}
+    template<typename T, typename U>
+    void check_vectors_equal( T&& cpu, U&& gpu) {
+	CHECK( type_name<decltype(cpu)>() == type_name<decltype(gpu)>() ); // should be unneccesary
+	CHECK( cpu.x == gpu.x );
+	CHECK( cpu.y == gpu.y );
+	CHECK( cpu.z == gpu.z );
+	CHECK( cpu.w == gpu.w );
+    }
 
-template<typename A, typename B>
-void run_tests() {
-    using T = Vector3_t<A>;
-    using U = Vector3_t<B>;
-    using R = std::common_type_t<T,U>;
+    template<typename A, typename B>
+    void run_tests() {
+	using T = Vector3_t<A>;
+	using U = Vector3_t<B>;
+	using R = std::common_type_t<T,U>;
     
-    T v1(1,1.005,0);
-    U v2(0,2,0);
-    R *gpu_result_d, gpu_result, cpu_result;
-    cudaMalloc((void **)&gpu_result_d, sizeof(R));
+	T v1(1,1.005,0);
+	U v2(0,2,0);
+	R *gpu_result_d, gpu_result, cpu_result;
+	cudaMalloc((void **)&gpu_result_d, sizeof(R));
 
-    for (BinaryOp_t op = ADD; op < FINAL; ++op) {
-	INFO( get_binary_op_name( op ) );
-	binary_op_test_kernel<R,T,U><<<1,1>>>(op, gpu_result_d, v1, v2);
-	cudaMemcpy(&gpu_result, gpu_result_d, sizeof(R), cudaMemcpyDeviceToHost);
-	cudaDeviceSynchronize();
+	for (BinaryOp_t op = ADD; op < FINAL; ++op) {
+	    INFO( get_binary_op_name( op ) );
+	    binary_op_test_kernel<R,T,U><<<1,1>>>(op, gpu_result_d, v1, v2);
+	    cudaMemcpy(&gpu_result, gpu_result_d, sizeof(R), cudaMemcpyDeviceToHost);
+	    cudaDeviceSynchronize();
 	
-	// Get cpu_result
-	cpu_result = (get_binary_op_func<R,T,U>(op))(v1,v2);
+	    // Get cpu_result
+	    cpu_result = (get_binary_op_func<R,T,U>(op))(v1,v2);
 
-	// Check consistency
-	check_vectors_equal(cpu_result, gpu_result);
+	    // Check consistency
+	    check_vectors_equal(cpu_result, gpu_result);
+	}
+	cudaFree(gpu_result_d);
     }
-    cudaFree(gpu_result_d);
-}
 
-TEST_CASE( "Check that Vector3_t binary operations are identical on GPU and CPU", "[Vector3]" ) {
-    // INFO("Test case start");
+    TEST_CASE( "Check that Vector3_t binary operations are identical on GPU and CPU", "[Vector3]" ) {
+	// INFO("Test case start");
     
-    run_tests<int,int>();
-    run_tests<float,float>();
-    run_tests<double,double>();
+	run_tests<int,int>();
+	run_tests<float,float>();
+	run_tests<double,double>();
 
-    run_tests<float,double>();
-    run_tests<double,float>();
-    run_tests<int,double>();
+	run_tests<float,double>();
+	run_tests<double,float>();
+	run_tests<int,double>();
+    }
 }
diff --git a/src/Types.h b/src/Types.h
index 6cfa33e..765b1ab 100644
--- a/src/Types.h
+++ b/src/Types.h
@@ -21,15 +21,11 @@
 #ifdef USE_CUDA
 #include <cuda_runtime.h>
 #else
-
 struct MY_ALIGN(16) float4 {
     float4() : x(0), y(0), z(0), w(0) {};
     float4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {};
     float4 operator+(const float4&& o) {
 	return float4(x+o.x,y+o.y,z+o.z,w+o.w);
-	// float4 r;
-	// r.x = x+o.x; r.y = y+o.y; r.z = z+o.z; r.w = w+o.w;
-	// return r;
     };
     float4 operator*(const float&& s) {
 	return float4(x*s,y*s,z*s,w*s);
@@ -39,9 +35,31 @@ struct MY_ALIGN(16) float4 {
 };
 #endif
 
+#include <stdarg.h>  // For va_start, etc.
+#include <memory>    // For std::unique_ptr
+#include <cstring>
+
+// from: https://stackoverflow.com/questions/2342162/stdstring-formatting-like-sprintf/8098080#8098080
+inline std::string string_format(const std::string fmt_str, ...) {
+    int final_n, n = ((int)fmt_str.size()) * 2; /* Reserve two times as much as the length of the fmt_str */
+    std::unique_ptr<char[]> formatted;
+    va_list ap;
+    while(1) {
+        formatted.reset(new char[n]); /* Wrap the plain char array into the unique_ptr */
+        strcpy(&formatted[0], fmt_str.c_str());
+        va_start(ap, fmt_str);
+        final_n = vsnprintf(&formatted[0], n, fmt_str.c_str(), ap);
+        va_end(ap);
+        if (final_n < 0 || final_n >= n)
+            n += abs(final_n - n + 1);
+        else
+            break;
+    }
+    return std::string(formatted.get());
+}
 
 #include "Types/Vector3.h"
-#include "Types/Vector3.h"
+#include "Types/Matrix3.h"
 
 using Vector3 = Vector3_t<float>;
-using Matrix3 = Vector3_t<float>; /* TODO: FIX */
+using Matrix3 = Matrix3_t<float,false>;
diff --git a/src/Types/Matrix3.h b/src/Types/Matrix3.h
new file mode 100644
index 0000000..6d0315f
--- /dev/null
+++ b/src/Types/Matrix3.h
@@ -0,0 +1,234 @@
+/*********************************************************************
+ * @file  Matrix3.h
+ * 
+ * @brief Declaration of templated Vector3_t class.
+ *********************************************************************/
+#pragma once
+#include <cassert>
+#include <memory>
+#include <type_traits> // for std::common_type<T,U>
+
+/**
+// IDEA: use struct for bool options
+// PROBLEM: such a struct will have a size of 1 byte because it is not a "base class subobject"
+//          - Might not be optimized away... have to check this empirically
+//          - i.e. this approach is good for "large" flow objects, but may be bad for gpu objects
+// SOLUTION?: make a base class that defines the opts subobject and?
+
+template<bool _is_diag=false, bool _check_diag=false>
+struct Matrix3_opts {
+    const bool inline is_diag() const { return _is_diag; }
+    const bool inline check_diag() const { return _check_diag; }
+};
+
+// Other strategies could include 
+
+// policy packs: https://stackoverflow.com/questions/21939217/combining-policy-classes-template-template-parameters-variadic-templates && https://www.modernescpp.com/index.php/policy-and-traits
+// inheritance is an interesting option here...
+
+**/
+
+template<typename T, bool is_diag=false, bool check_diag=false>
+struct MY_ALIGN(16*sizeof(T)) Matrix3_t  {
+    using Matrix3 = Matrix3_t<T,is_diag,check_diag>;
+    using Vector3 = Vector3_t<T>;
+
+    HOST DEVICE inline Matrix3_t() { (*this) = Matrix3(1); }
+    HOST DEVICE Matrix3_t(T s) { (*this) = Matrix3(s,s,s); }
+    HOST DEVICE Matrix3_t(T x, T y, T z) { (*this) = Matrix3( x, 0, 0, 0, y, 0, 0, 0, z); }
+    HOST DEVICE Matrix3_t(const T* d) { (*this) = Matrix3(d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], d[8]); }
+    HOST DEVICE Matrix3_t(const Vector3& ex, const Vector3& ey, const Vector3& ez) { (*this) = Matrix3(ex.x, ex.y, ex.z, ey.x, ey.y, ey.z, ez.x, ez.y, ez.z); }
+    HOST DEVICE Matrix3_t(T xx, T xy, T xz, T yx, T yy, T yz, T zx, T zy, T zz) : xx(xx), xy(xy), xz(xz), yx(yx), yy(yy), yz(yz), zx(zx), zy(zy), zz(zz) { diag_check(); }
+
+    HOST DEVICE inline void diag_check() const {
+	// if (opts.check_diag() && opts.is_diag()) {}
+	if (check_diag && is_diag) {
+	    assert(xy == xz == yx == yz == zx == zy == 0);
+	    // assert(xy == 0 && xz == 0 &&
+	    // 	   yx == 0 && yz == 0 &&
+	    // 	   zx == 0 && zy == 0);
+	}
+    }
+	
+    // Operators
+    template<typename U>
+	HOST DEVICE inline auto operator*(U s) const {
+	Matrix3_t<std::common_type_t<T,U>, is_diag, check_diag> m;
+	m.xx = s*xx; m.xy = s*xy; m.xz = s*xz;
+	m.yx = s*yx; m.yy = s*yy; m.yz = s*yz;
+	m.zx = s*zx; m.zy = s*zy; m.zz = s*zz;
+	return m;
+    }
+
+    template<typename U>
+	HOST DEVICE inline auto operator*(const Vector3_t<U>& v) const { return this->transform(v); }
+
+    template<typename U, bool is_diag2, bool check_diag2>
+	HOST DEVICE inline auto operator*(const Matrix3_t<U,is_diag2,check_diag2>& m) const { return this->transform(m); }
+
+    template<typename U, bool is_diag2, bool check_diag2>
+	HOST DEVICE inline auto operator+(const Matrix3_t<U,is_diag2,check_diag2>& m) const {
+	Matrix3_t<std::common_type_t<T,U>, is_diag && is_diag2, check_diag||check_diag2 > ret;
+	ret.xx = xx+m.xx;
+	ret.yy = yy+m.yy;
+	ret.zz = zz+m.zz;
+
+	if ( (!is_diag) || (!is_diag2) ) {
+	    ret.xy = xy+m.xy; ret.xz = xz+m.xz;
+	    ret.yx = yx+m.yx; ret.yz = yz+m.yz;
+	    ret.zx = zx+m.zx; ret.zy = zy+m.zy;
+	}	    
+	return ret;
+    }
+
+    template<typename U, bool is_diag2, bool check_diag2>
+	HOST DEVICE inline auto operator-(const Matrix3_t<U,is_diag2,check_diag2>& m) const { return (*this)+(-m); }
+
+	HOST DEVICE inline Matrix3 operator-() const { return Matrix3(-xx,-xy,-xz,-yx,-yy,-yz,-zx,-zy,-zz); }
+    HOST DEVICE inline Matrix3 transpose() const { return Matrix3(xx,yx,zx,xy,yy,zy,xz,yz,zz); }
+
+    template<typename U>
+	HOST DEVICE inline auto transform(const Vector3_t<U>& v) const {
+	Vector3_t<std::common_type_t<T,U>> w;
+	if (is_diag) {
+	    w.x = xx*v.x;
+	    w.y = yy*v.y;
+	    w.z = zz*v.z;
+	} else {
+	    w.x = xx*v.x + xy*v.y + xz*v.z;
+	    w.y = yx*v.x + yy*v.y + yz*v.z;
+	    w.z = zx*v.x + zy*v.y + zz*v.z;
+	}
+	return w;
+    }
+
+    template<typename U, bool is_diag2, bool check_diag2>
+	HOST DEVICE inline auto transform(const Matrix3_t<U,is_diag2,check_diag2>& m) const {
+	Matrix3_t<std::common_type_t<T,U>, is_diag && is_diag2, check_diag||check_diag2 > ret;
+	ret.xx = xx*m.xx + xy*m.yx + xz*m.zx;
+	ret.yy = yx*m.xy + yy*m.yy + yz*m.zy;
+	ret.zz = zx*m.xz + zy*m.yz + zz*m.zz;
+
+	if ( (!is_diag) || (!is_diag2) ) {
+	    ret.yx = yx*m.xx + yy*m.yx + yz*m.zx;
+	    ret.zx = zx*m.xx + zy*m.yx + zz*m.zx;
+
+	    ret.xy = xx*m.xy + xy*m.yy + xz*m.zy;
+	    ret.zy = zx*m.xy + zy*m.yy + zz*m.zy;
+
+	    ret.xz = xx*m.xz + xy*m.yz + xz*m.zz;
+	    ret.yz = yx*m.xz + yy*m.yz + yz*m.zz;
+	}	    
+	return ret;
+    }
+
+    HOST DEVICE
+	Matrix3 inverse() const {
+	Matrix3 m;
+	if (is_diag) {
+	    return Matrix3(T(1.0)/xx,T(1.0)/yy,T(1.0)/zz);
+	} else {
+	    T det = this->det();
+	    return Matrix3( (yy*zz-yz*zy)/det,-(xy*zz-xz*zy)/det, (xy*yz-xz*yy)/det,
+			   -(yx*zz-yz*zx)/det, (xx*zz-xz*zx)/det,-(xx*yz-xz*yx)/det,
+			    (yx*zy-yy*zx)/det,-(xx*zy-xy*zx)/det, (xx*yy-xy*yx)/det );
+	}
+    }
+	
+    HOST DEVICE
+        T det() const { if (is_diag) {
+	    return xx*yy*zz;
+	} else {
+	    return xx*(yy*zz-yz*zy) - xy*(yx*zz-yz*zx) + xz*(yx*zy-yy*zx);
+	}
+    }
+
+    //Han-Yi Chou
+    HOST DEVICE inline Matrix3 normalized() const {                
+	Vector3 x = this->ex();
+	Vector3 y = this->ey();
+	/*
+	  x = x / x.length();
+	  float error = x.dot(y);
+	  y = y-(error*x);
+	  y = y / y.length();
+	  Vector3 z = x.cross(y);
+	  z = z / z.length();*/
+	//x = (0.5*(3-x.dot(x)))*x; /* approximate normalization */
+	//y = (0.5*(3-y.dot(y)))*y; 
+	//z = (0.5*(3-z.dot(z)))*z; 
+	//return Matrix3(x,y,z);		
+	Vector3 z = x.cross(y);
+	T l;
+	l = z.length();
+	z = l > 0 ? z/l : Vector3(T(0));
+	l = x.length();
+	x = l > 0 ? x/l : Vector3(T(0));
+	y = z.cross(x);
+	l = y.length();
+	y = l > 0 ? y/l : Vector3(T(0));
+	return Matrix3(x,y,z);
+    }
+	
+    HOST DEVICE inline Vector3 ex() const { return Vector3(xx,yx,zx); }
+    HOST DEVICE inline Vector3 ey() const { return Vector3(xy,yy,zy); }
+    HOST DEVICE inline Vector3 ez() const { return Vector3(xz,yz,zz); }
+
+    auto to_string() const {
+	return string_format("%2.8f %2.8f %2.8f\n%2.8f %2.8f %2.8f\n%2.8f %2.8f %2.8f", xx,xy,xz,yx,yy,yz,zx,zy,zz);
+	// return std::format("\n{} {} {}\n{} {} {}\n{} {} {}", xx,xy,xz,yx,yy,yz,zx,zy,zz);
+	// return std::format("\n{} {} {}\n{} {} {}\n{} {} {}", xx,xy,xz,yx,yy,yz,zx,zy,zz);
+	// char s[256];
+	// sprintf(s, "%2.8f %2.8f %2.8f\n%2.8f %2.8f %2.8f\n%2.8f %2.8f %2.8f",
+	// 	xx, xy, xz, yx, yy, yz, zx, zy, zz);
+	// s[255] = 0;
+	// return std::string(s);
+    }
+
+    HOST DEVICE inline bool is_diagonal() const { return is_diag; }
+
+    template<typename U>
+	HOST DEVICE inline bool operator==(U b) const {
+	return xx == b.xx && xy == b.xy && xz == b.xz &&
+	    yx == b.yx && yy == b.yy && yz == b.yz &&
+	    zx == b.zx && zy == b.zy && zz == b.zz;
+    }
+
+    // Helper function for testing
+    template<typename U> void test_equal(const U&& b) const {
+	CHECK( xx == b.xx );
+	CHECK( xy == b.xy );
+	CHECK( xz == b.xz );
+	CHECK( yx == b.yx );
+	CHECK( yy == b.yy );
+	CHECK( yz == b.yz );
+	CHECK( zx == b.zx );
+	CHECK( zy == b.zy );
+	CHECK( zz == b.zz );
+    }
+    
+    T xx, xy, xz;
+    T yx, yy, yz;
+    T zx, zy, zz;
+};
+
+// template<typename ...Args>
+// HOST std::ostream& operator << ( std::ostream& os, Matrix3_t<Args...> const& value ) {
+//     printf("MATRIX OP\n");
+//     os << value.to_string();
+//     return os;
+// }
+
+
+/* template<typename U> */
+/* HOST DEVICE friend inline Matrix3 operator*(float s, Matrix3 m) { return m*s; } */
+/* HOST DEVICE friend inline Matrix3 operator/(Matrix3 m, float s) { */
+/* 	const float sinv = 1.0f/s; */
+/* 	return m*sinv; */
+/* } */
+
+    // HOST DEVICE void setIsDiag() {
+    // 	isDiag = (xy == 0 && xz == 0 &&
+    // 		  yx == 0 && yz == 0 &&
+    // 		  zx == 0 && zy == 0) ? true : false;
+    // }
diff --git a/src/Types/Vector3.h b/src/Types/Vector3.h
index 52345b3..d500311 100644
--- a/src/Types/Vector3.h
+++ b/src/Types/Vector3.h
@@ -4,6 +4,7 @@
  * @brief Declaration of templated Vector3_t class.
  *********************************************************************/
 #pragma once
+#include <memory>
 #include <type_traits> // for std::common_type<T,U>
 
 /**
@@ -18,11 +19,11 @@ g */
 template<typename T>
 class MY_ALIGN(4*sizeof(T)) Vector3_t {
 public:
-	HOST DEVICE inline Vector3_t<T>() : x(0), y(0), z(0), w(0) {}
+    HOST DEVICE inline Vector3_t<T>() : x(T(0)), y(T(0)), z(T(0)), w(T(0)) {}
 	HOST DEVICE inline Vector3_t<T>(T s):x(s), y(s), z(s), w(s) {}
 	HOST DEVICE inline Vector3_t<T>(const Vector3_t<T>& v):x(v.x), y(v.y), z(v.z), w(v.w)  {}
 	HOST DEVICE inline Vector3_t<T>(T x0, T y0, T z0) : x(x0), y(y0), z(z0), w(0) {}
-	HOST DEVICE inline Vector3_t<T>(const T* d) : x(d[0]), y(d[1]), z(d[2]), w(0) {}
+	// HOST DEVICE inline Vector3_t<T>(const T* d) : x(d[0]), y(d[1]), z(d[2]), w(0) {}
         HOST DEVICE inline Vector3_t<T>(const float4 a) : x(a.x), y(a.y), z(a.z), w(a.w) {}
 
 #if __cplusplus >= 201703L
@@ -130,8 +131,8 @@ public:
 	}
 
 	template<typename U>
-	HOST DEVICE inline Vector3_t<std::common_type_t<T,U>> operator/(U&& s) const {
-	    const U inv = static_cast<U>(1.0)/s;
+	HOST DEVICE inline Vector3_t<std::common_type_t<T,U>> operator/(U& s) const {
+	    const U inv = U(1.0)/s;
 	    return (*this)*inv;
 	}
 
@@ -175,14 +176,19 @@ public:
 		printf("%0.3f %0.3f %0.3f\n", x,y,z);
 	}
 
-	T x, y, z, w; //append a member w	
-
-	char* to_char() const {
-	    char* s = new char[128];
-	    sprintf(s, "%.10g %.10g %.10g", x, y, z);
+	auto to_string() const {
+	    char s[128];
+	    sprintf(s, "%.10g %.10g %.10g (%.10g)", x, y, z, w);
 	    s[127] = 0;
-	    return s;
+	    return std::string(s);
+	}
+
+	template<typename U>
+	    HOST DEVICE inline bool operator==(U b) const {
+	    return x == b.x && y == b.y && z == b.z && w == b.w;
 	}
+
+	T x, y, z, w; //append a member w	
 };
 
 
@@ -202,7 +208,7 @@ public:
 // }
 
 template<typename T, typename U>
-HOST DEVICE inline auto operator/(U&& s, Vector3_t<T>&& v) {
+HOST DEVICE inline auto operator/(const U&& s, const Vector3_t<T>&& v) {
     // TODO, throw exception if int
     using TU = typename std::common_type<T,U>::type;
     Vector3_t<TU> ret;
@@ -212,6 +218,26 @@ HOST DEVICE inline auto operator/(U&& s, Vector3_t<T>&& v) {
     return ret;
 }
 
+// template<typename T, typename U>
+// HOST DEVICE inline auto operator*(const U&& s, const Vector3_t<T>&& v) {
+//     return v*s;
+// }
+// template<typename T, typename U>
+// HOST DEVICE inline auto operator*(const U& s, const Vector3_t<T>& v) {
+//     return v*s;
+// }
+
+// template<typename T>
+// HOST DEVICE inline auto operator*(const float&& s, const Vector3_t<T>&& v) {
+//     return v*s;
+// }
+
+template<typename T>
+HOST DEVICE inline auto operator*(const float& s, const Vector3_t<T>& v) {
+    return v*s;
+}
+
+// Provide common type for vectors
 namespace std {
     template<typename T,typename U>
     struct common_type<Vector3_t<T>, Vector3_t<U>> {
-- 
GitLab