diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4c3fa1e1bfafd21b082c7bfcdbea63bed93a9038
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_library("lib${PROJECT_NAME}"
+  GPUManager.cpp
+  ParticlePatch.cpp
+  Integrator.cpp
+  Integrator/CPU.cpp
+  Integrator/CUDA.cu
+  Interaction.cpp
+  Interaction/CPU.cpp
+  Interaction/CUDA.cu
+  SignalManager.cpp
+)
diff --git a/src/Integrator/kernels.h b/src/Integrator/kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..29c7412bba618081d7c66ae7cba08acac2d37081
--- /dev/null
+++ b/src/Integrator/kernels.h
@@ -0,0 +1,25 @@
+#pragma once
+
+// #include "../useful.h"
+#include "../Types.h"
+
+#ifdef __CUDACC__
+    #define HOST __host__
+    #define DEVICE __device__
+#else
+    #define HOST
+    #define DEVICE
+#endif
+
+namespace IntegratorKernels {
+    HOST DEVICE  void __inline__ BDIntegrate() {
+	// std::cout << "Computes::BDIntegrate_inline" << std::endl;
+	printf("Integrator::BDIntegrate\n");
+    };
+
+    HOST DEVICE  void __inline__ BDIntegrate(Vector3* __restrict__ pos, const Vector3 * const __restrict__ force, const int& idx, float& root_Dt, Vector3& normal_sample_3D) {
+	printf("Integrator::BDIntegrate\n");
+	pos[idx] = pos[idx] + force[idx] * root_Dt + normal_sample_3D;
+    };
+
+}
diff --git a/src/Interaction.cpp b/src/Interaction.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0ec6e2d72bbb7b5db747f83f5281c250f7cb3cc6
--- /dev/null
+++ b/src/Interaction.cpp
@@ -0,0 +1,70 @@
+#include "Interaction.h"
+
+bool operator<(const LocalInteraction::Conf x, const LocalInteraction::Conf y) { return (int(x) < int(y)); };
+std::map<LocalInteraction::Conf, LocalInteraction*> LocalInteraction::_interactions;
+	
+LocalInteraction* LocalInteraction::GetInteraction(Conf& conf) {
+	// Checks _interactions for a matching configuration, returns one if found, otherwise creates
+	if (conf.backend == Conf::Default) {
+#ifdef USE_CUDA
+	    conf.backend = Conf::CUDA;
+#else
+	    conf.backend = Conf::CPU;
+#endif
+	}
+
+	// Insert configuration into map, if it exists 
+	auto emplace_result = LocalInteraction::_interactions.emplace(conf, nullptr);
+	auto& it = emplace_result.first;
+	bool& inserted = emplace_result.second;
+	if (inserted) {
+	    // Conf not found, so create a new one 
+ 	    LocalInteraction* tmp;
+
+	    switch (conf.object_type) {
+	    case Conf::Particle:
+		switch (conf.dof) {
+		case Conf::Bond:
+		    switch (conf.form) {
+		    case Conf::Harmonic:
+			switch (conf.backend) {
+			case Conf::CUDA:
+#ifdef USE_CUDA
+			    tmp = new LocalBondedCUDA();
+#else
+			    std::cerr << "WARNING: LocalInteraction::GetInteraction: "
+				      << "CUDA disabled, creating CPU interaction instead" << std::endl;
+			    tmp = new LocalBonded();
+#endif
+			    break;
+			case Conf::CPU:
+			    tmp = new LocalBonded();
+			    break;
+			default:
+			    std::cerr << "Error: LocalInteraction::GetInteraction: "
+				      << "Unrecognized backend; exiting" << std::endl;
+			}
+		    default:
+			std::cerr << "Error: LocalInteraction::GetInteraction: "
+				  << "Unrecognized form; exiting" << std::endl;
+			assert(false);
+		    }
+		    break;
+		default:
+		    std::cerr << "Error: Interaction::GetInteraction: "
+			      << "Unrecognized algorithm type; exiting" << std::endl;
+		    assert(false);
+		}
+		break;
+	    case Conf::RigidBody:
+		assert(false);
+		break;
+	    default:
+		std::cerr << "Error: Interaction::GetInteraction: "
+			  << "Unrecognized object type; exiting" << std::endl;
+		assert(false);
+	    }
+	    it->second = tmp;
+	}
+	return it->second;
+}
diff --git a/src/Interaction/CPU.cpp b/src/Interaction/CPU.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..39d4bd1b192ec82e1efda4a2fdad8e6d47c8f480
--- /dev/null
+++ b/src/Interaction/CPU.cpp
@@ -0,0 +1,6 @@
+#include "CPU.h"
+
+void LocalBonded::compute(Patch* p) {
+    std::cout << "LocalBonded::compute()" << std::endl;
+    InteractionKernels::HarmonicBonds();
+};
diff --git a/src/Interaction/CPU.h b/src/Interaction/CPU.h
new file mode 100644
index 0000000000000000000000000000000000000000..777a59bc11ea4d627fe57c5db379f0c786734dc6
--- /dev/null
+++ b/src/Interaction/CPU.h
@@ -0,0 +1,8 @@
+#pragma once
+#include "../Interaction.h"
+
+class LocalBonded : public LocalInteraction {
+public:
+    void compute(Patch* patch);
+    int num_patches() const { return 1; };
+};
diff --git a/src/Interaction/CUDA.cu b/src/Interaction/CUDA.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a89fdd8dc88caca6a6453199d55b4c80867ea416
--- /dev/null
+++ b/src/Interaction/CUDA.cu
@@ -0,0 +1,15 @@
+#include "CUDA.h"
+
+#ifdef USE_CUDA
+__global__ void HarmonicBonds_kernel() {
+    if (threadIdx.x == 0) {
+	printf("HarmonicBonds_kernel()\n");
+	InteractionKernels::HarmonicBonds();
+    }
+};
+
+void LocalBondedCUDA::compute(Patch* p) {
+    printf("HarmonicBondsCUDA::compute()\n");
+    HarmonicBonds_kernel<<<1,32>>>();
+};
+#endif
diff --git a/src/Interaction/CUDA.h b/src/Interaction/CUDA.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a3e9e53bcd82b054ba8d584856987d2f5af388e
--- /dev/null
+++ b/src/Interaction/CUDA.h
@@ -0,0 +1,10 @@
+#pragma once
+#include "../Interaction.h"
+
+#ifdef USE_CUDA
+class LocalBondedCUDA : public LocalInteraction {
+public:
+    void compute(Patch* patch);
+    int num_patches() const { return 1; };
+};
+#endif
diff --git a/src/Interaction/kernels.h b/src/Interaction/kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ee970cd974ea8104b87ee0e319e66a300d17bed
--- /dev/null
+++ b/src/Interaction/kernels.h
@@ -0,0 +1,24 @@
+#pragma once
+
+// #include "../useful.h"
+#include "../Types.h"
+
+#ifdef __CUDACC__
+    #define HOST __host__
+    #define DEVICE __device__
+#else
+    #define HOST
+    #define DEVICE
+#endif
+
+namespace InteractionKernels {
+    HOST DEVICE  void __inline__ HarmonicBonds() {
+	// std::cout << "Computes::BDIntegrate_inline" << std::endl;
+	printf("Interaction::HarmonicBondsDummy()\n");
+    };
+
+    HOST DEVICE  void __inline__ HarmonicBonds(Vector3* __restrict__ pos, const Vector3 * const __restrict__ force) {
+	printf("Interaction::HarmonicBonds\n");
+	// pos[idx] = pos[idx] + force[idx] * root_Dt + normal_sample_3D;
+    };
+}
diff --git a/src/Tests/CMakeLists.txt b/src/Tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f85436c0f56a0ca5bedb5692e6049d12a85105d3
--- /dev/null
+++ b/src/Tests/CMakeLists.txt
@@ -0,0 +1,31 @@
+find_package(Catch2 3 REQUIRED)
+
+# include(doctest)
+include(Catch) # contains some cmake macros that will help with discovery of tests
+include(CTest)
+
+## TODO, add a macro to add these projects, build with CUDA
+add_executable("arbd_tests"
+vector3_precision.cu
+)
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --extended-lambda")
+set_property(TARGET arbd_tests PROPERTY CXX_STANDARD 14)
+set_property(TARGET arbd_tests PROPERTY CUDA_STANDARD 14)
+# target_link_libraries("arbd_tests" PRIVATE "lib${PROJECT_NAME}" Catch2::Catch2)
+target_link_libraries("arbd_tests" PRIVATE "lib${PROJECT_NAME}" Catch2::Catch2WithMain)
+
+## Add optional libraries
+if(USE_CUDA)
+  target_link_libraries(arbd_tests PUBLIC curand)
+endif()
+if(USE_NCCL)
+  target_link_libraries(arbd_tests PUBLIC nccl)
+endif()
+if(USE_NVTX)
+  target_link_libraries(arbd_tests PUBLIC nvToolsExt)
+endif()
+
+## catch_discover_tests("${PROJECT_NAME}_TESTS")
+
+catch_discover_tests(arbd_tests)
+install(TARGETS arbd_tests)
diff --git a/src/Tests/type_name.h b/src/Tests/type_name.h
new file mode 100644
index 0000000000000000000000000000000000000000..90e83093f160191ed082236f36f57801a746e3c5
--- /dev/null
+++ b/src/Tests/type_name.h
@@ -0,0 +1,33 @@
+#include <type_traits>
+#include <typeinfo>
+#ifndef _MSC_VER
+#include <cxxabi.h>
+#endif
+#include <memory>
+#include <string>
+#include <cstdlib>
+
+template <typename T>
+std::string type_name() {
+    using TR = typename std::remove_reference<T>::type;
+    std::unique_ptr<char, void(*)(void*)> own
+           (
+#ifndef _MSC_VER
+                abi::__cxa_demangle(typeid(TR).name(), nullptr,
+                                           nullptr, nullptr),
+#else
+                nullptr,
+#endif
+                std::free
+           );
+    std::string r = own != nullptr ? own.get() : typeid(TR).name();
+    if (std::is_const<TR>::value)
+        r += " const";
+    if (std::is_volatile<TR>::value)
+        r += " volatile";
+    if (std::is_lvalue_reference<T>::value)
+        r += "&";
+    else if (std::is_rvalue_reference<T>::value)
+        r += "&&";
+    return r;
+}
diff --git a/src/Tests/vector3_precision.cu b/src/Tests/vector3_precision.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8fba1f9580e6f5d181fbc2250940bc8a6b918262
--- /dev/null
+++ b/src/Tests/vector3_precision.cu
@@ -0,0 +1,194 @@
+#include <float.h>
+#include <iostream>
+#include <cstdio>
+
+// #include "useful.h"
+#include "../SignalManager.h"
+#include "../Types.h"
+#include <cuda.h>
+
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+
+#include "type_name.h"
+
+
+// Test kernel function
+// namespace TestVectorOps {
+
+// ChatGPT("Add doxygen to each function, class and struct, where R represents a return parameter, T and U")
+
+template<typename R, typename T, typename U>
+HOST DEVICE R cross_vectors(T&& v1, U&& v2) {
+    return v1.cross(v2);
+}
+
+template<typename F, typename R, typename T, typename U>
+__global__ void binary_op_test_kernel( F fn, R* result, T in1, U in2 ) {
+    if (blockIdx.x == 0) {
+	*result = (*fn)(in1,in2);
+    }
+}
+
+template<typename T, typename U>
+void check_vectors_equal( T&& cpu, U&& gpu) {
+    CHECK( type_name<decltype(cpu)>() == type_name<decltype(gpu)>() ); // should be unneccesary
+    CHECK( cpu.x == gpu.x );
+    CHECK( cpu.y == gpu.y );
+    CHECK( cpu.z == gpu.z );
+    CHECK( cpu.w == gpu.w );
+}
+
+template<typename R, typename T, typename U>
+struct TestOp {
+    std::string name;
+    typename R (*)(T,U) func;
+};
+
+
+TEST_CASE( "Check that Vector3_t binary operations are identical on GPU and CPU", "[Vector3Dcross]" ) {
+    // INFO("Test case start");
+    using T = Vector3_t<float>;
+    using U = Vector3_t<double>;
+    using R = std::common_type_t<T,U>;
+    
+    T v1(1,0,0);
+    U v2(0,2,0);
+    R *gpu_result_d, gpu_result, cpu_result;
+
+    
+    std::vector<TestOp> ops;
+    ops.push_back(TestOp("add", [] __host__ __device__ (T a, U b){ return static_cast<R>(b+a); }; ));
+    ops.push_back(TestOp("cross", [] __host__ __device__ (T a, U b){ return a.cross(b); }; ));
+    // auto sub = [] __host__ __device__ (T& a, U& b){ return a.cross(b); };
+    // auto dot = [] __host__ __device__ (T& a, U& b){ return a.cross(b); };
+    auto cross = [] __host__ __device__ (T a, U b){ return a.cross(b); };
+
+    using L = decltype(add);
+    std::vector<R(*)(T,U)> lambdas;
+    lambdas.push_back(add);
+    lambdas.push_back(cross);
+    
+    // Get_result
+    cudaMalloc((void **)&gpu_result_d, sizeof(R));
+
+    for (auto& op: ops) {
+	INFO(op.name);
+	binary_op_test_kernel<<<1,1>>>(op.func, gpu_result_d, v1, v2);
+	cudaMemcpy(&gpu_result, gpu_result_d, sizeof(R), cudaMemcpyDeviceToHost);
+	cudaDeviceSynchronize();
+
+	// Get cpu_result
+	cpu_result = fn(v1,v2); // rvalue version failed on cpu, but works on gpu
+
+	// Check consistency
+	check_vectors_equal(cpu_result, gpu_result);
+    }
+}
+
+// int main(int argc, char* argv[]) {
+//     return 0;
+// }
+
+// int main(int argc, char* argv[]) {
+//     SignalManager::manage_segfault();
+//     Vector3_t<float>  v1(1,0,0);
+//     Vector3_t<double> v2(0,2,0);
+//     v2.y = 6.62606957e-44;
+    
+//     std::cout << "test_vector3.cpp" << std::endl;
+//     // // std::cout << v1.to_char() << std::endl;
+    
+//     // std::cout << v1.to_char() << " x " << v2.to_char() <<
+//     // 	" = " << (v1.cross(v2)).to_char() << std::endl;
+//     // std::cout << (-v2).to_char() << " x " << v1.to_char() <<
+//     // 	" = " << (-v2.cross(v1)).to_char() << std::endl;
+
+//     // std::cout.precision(17);
+//     // std::cout << sizeof(v2.cross(v1).z) << " " << sizeof(v1.cross(v2).z) << std::endl;
+//     // std::cout << (v2.cross(v1).z) << " " << (v1.cross(v2).z) << std::endl;
+//     // std::cout << float(6.62606957e-44) << std::endl;
+//     // std::cout << " ... done" << std::endl;
+
+//     // test_kernel<<<1,1>>>(v1,v2);
+//     // cudaDeviceSynchronize();
+//     return 0;
+// }
+
+// #include <iostream>
+
+// // Since C++ 11
+// // template<typename T>
+// // using func_t = T (*) (T, T);
+
+// template<typename R, typename T, typename U>
+// using func_t = R (*) (T, U);
+
+// template <typename R, typename T, typename U> 
+// __host__ __device__ R add_func (T x, U y)
+// {
+//     return x + y;
+// }
+
+// template <typename T> 
+// __host__ __device__ T mul_func (T x, T y)
+// {
+//     return x * y;
+// }
+
+// // Required for functional pointer argument in kernel function
+// // Static pointers to device functions
+// template <typename R, typename T, typename U> 
+// __device__ func_t<R,T,U> p_add_func = add_func<R,T,U>;
+// // template <typename T> 
+// // __device__ func_t<T> p_mul_func = mul_func<T>;
+
+
+// template <typename R, typename T, typename U> 
+// __global__ void kernel(func_t<R,T,U> op, T * d_x, U * d_y, R * result)
+// {
+//     *result = (*op)(*d_x, *d_y);
+// }
+
+// template <typename T, typename U> 
+// void mytest(T x, U y)
+// {
+//     using R = std::common_type_t<T,U>;
+//     func_t<R,T,U> h_add_func;
+// //    func_t<T> h_mul_func;
+
+//     T * d_x;
+//     U * d_y;
+//     cudaMalloc(&d_x, sizeof(T));
+//     cudaMalloc(&d_y, sizeof(U));
+//     cudaMemcpy(d_x, &x, sizeof(T), cudaMemcpyHostToDevice);
+//     cudaMemcpy(d_y, &y, sizeof(U), cudaMemcpyHostToDevice);
+
+//     R result;
+//     R * d_result, * h_result;
+//     cudaMalloc(&d_result, sizeof(R));
+//     h_result = &result;
+
+//     // Copy device function pointer to host side
+//     cudaMemcpyFromSymbol(&h_add_func, p_add_func<R,T,U>, sizeof(func_t<R,T,U>));
+//     // cudaMemcpyFromSymbol(&h_mul_func, p_mul_func<T>, sizeof(func_t<T>));
+
+//     kernel<R,T,U><<<1,1>>>(h_add_func, d_x, d_y, d_result);
+//     cudaDeviceSynchronize();
+//     cudaMemcpy(h_result, d_result, sizeof(R), cudaMemcpyDeviceToHost);
+//     CHECK( result == add_func<R,T,U>(x,y) );
+    
+//     // kernel<T><<<1,1>>>(h_mul_func, d_x, d_y, d_result);
+//     // cudaDeviceSynchronize();
+//     // cudaMemcpy(h_result, d_result, sizeof(T), cudaMemcpyDeviceToHost);
+//     // CHECK( result == mul_func(x,y) );
+// }
+
+// TEST_CASE( "Check that Vector3_t binary operations are identical on GPU and CPU", "[Vector3Dcross]" ) {
+//     INFO("TEST START");
+//     mytest<int,float>(2.05, 10.00);
+//     // mytest<float>(2.05, 10.00);
+//     // mytest<double>(2.05, 10.00);
+// }
+
+// */
diff --git a/src/Types.h b/src/Types.h
new file mode 100644
index 0000000000000000000000000000000000000000..6cfa33eda7c5868d6452029cadccb4b4c3202de5
--- /dev/null
+++ b/src/Types.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#ifdef __CUDACC__
+    #define HOST __host__
+    #define DEVICE __device__
+#else
+    #define HOST
+    #define DEVICE
+#endif
+
+#if defined(__CUDACC__) // NVCC
+   #define MY_ALIGN(n) __align__(n)
+#elif defined(__GNUC__) // GCC
+  #define MY_ALIGN(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER) // MSVC
+  #define MY_ALIGN(n) __declspec(align(n))
+#else
+  #error "Please provide a definition for MY_ALIGN macro for your host compiler!"
+#endif
+
+#ifdef USE_CUDA
+#include <cuda_runtime.h>
+#else
+
+struct MY_ALIGN(16) float4 {
+    float4() : x(0), y(0), z(0), w(0) {};
+    float4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {};
+    float4 operator+(const float4&& o) {
+	return float4(x+o.x,y+o.y,z+o.z,w+o.w);
+	// float4 r;
+	// r.x = x+o.x; r.y = y+o.y; r.z = z+o.z; r.w = w+o.w;
+	// return r;
+    };
+    float4 operator*(const float&& s) {
+	return float4(x*s,y*s,z*s,w*s);
+    };
+    
+    float x,y,z,w;
+};
+#endif
+
+
+#include "Types/Vector3.h"
+#include "Types/Vector3.h"
+
+using Vector3 = Vector3_t<float>;
+using Matrix3 = Vector3_t<float>; /* TODO: FIX */
diff --git a/src/Types/Vector3.h b/src/Types/Vector3.h
new file mode 100644
index 0000000000000000000000000000000000000000..890de64ea4938eb70b53d0ac0ddb0a9626e6f389
--- /dev/null
+++ b/src/Types/Vector3.h
@@ -0,0 +1,219 @@
+/*********************************************************************
+ * @file  Vector3.h
+ * 
+ * @brief Declaration of templated Vector3_t class.
+ *********************************************************************/
+#pragma once
+#include <type_traits> // for std::common_type<T,U>
+
+/**
+ * 3D vector utility class with common operations implemented on CPU and GPU.
+ * 
+ * Implemented with 4D data storage for better GPU alignment; extra
+ * data can be stored in fourth varaible this->w
+ *
+ * @tparam T the type of data stored in the four fields, x,y,z,w; T
+ * Should usually be float or double.
+g */
+template<typename T>
+class MY_ALIGN(4*sizeof(T)) Vector3_t {
+public:
+	HOST DEVICE inline Vector3_t<T>() : x(0), y(0), z(0), w(0) {}
+	HOST DEVICE inline Vector3_t<T>(T s):x(s), y(s), z(s), w(s) {}
+	HOST DEVICE inline Vector3_t<T>(const Vector3_t<T>& v):x(v.x), y(v.y), z(v.z), w(v.w)  {}
+	HOST DEVICE inline Vector3_t<T>(T x0, T y0, T z0) : x(x0), y(y0), z(z0), w(0) {}
+	HOST DEVICE inline Vector3_t<T>(const T* d) : x(d[0]), y(d[1]), z(d[2]), w(0) {}
+        HOST DEVICE inline Vector3_t<T>(const float4 a) : x(a.x), y(a.y), z(a.z), w(a.w) {}
+
+#if __cplusplus >= 201703L
+	// TODO: test if c++17 constexpr performs better than std::common_type, otherwise get rid of this #if block
+	template <typename U>
+	    HOST DEVICE inline Vector3_t<typename std::common_type<T,U>::type> cross(const Vector3_t<U>& w) const {
+	    if constexpr(sizeof(U) < sizeof(T)) {
+		Vector3_t<T> v;
+		v.x = y*w.z - z*w.y;
+		v.y = z*w.x - x*w.z;
+		v.z = x*w.y - y*w.x;
+		return v;
+	    } else {
+		Vector3_t<U> v;
+		v.x = y*w.z - z*w.y;
+		v.y = z*w.x - x*w.z;
+		v.z = x*w.y - y*w.x;
+		return v;
+	    }
+	}
+#else
+	template <typename U>
+	    HOST DEVICE inline Vector3_t<typename std::common_type<T,U>::type> cross(const Vector3_t<U>& w) const {
+	    using TU = typename std::common_type<T,U>::type;
+	    Vector3_t<TU> v;
+	    v.x = y*w.z - z*w.y;
+	    v.y = z*w.x - x*w.z;
+	    v.z = x*w.y - y*w.x;
+	    return v;
+	}
+#endif
+
+	HOST DEVICE inline Vector3_t<T>& operator=(const Vector3_t<T>&& v) {
+		x = v.x;
+		y = v.y;
+		z = v.z;
+		return *this;
+	}
+
+	HOST DEVICE inline Vector3_t<T>& operator+=(const Vector3_t<T>&& v) {
+		x += v.x;
+		y += v.y;
+		z += v.z;
+		return *this;
+	}
+
+	HOST DEVICE inline Vector3_t<T>& operator-=(const Vector3_t<T>&& v) {
+		x -= v.x;
+		y -= v.y;
+		z -= v.z;
+		return *this;
+	}
+
+	HOST DEVICE inline Vector3_t<T>& operator*=(float s) {
+		x *= s;
+		y *= s;
+		z *= s;
+		return *this;
+	}
+
+	HOST DEVICE inline Vector3_t<T>& operator/=(float s) {
+		const float sinv = 1.0f/s;
+		x *= sinv;
+		y *= sinv;
+		z *= sinv;
+		return *this;
+	}
+
+	HOST DEVICE inline Vector3_t<T> operator-() const {
+		Vector3_t<T> v;
+		v.x = -x;
+		v.y = -y;
+		v.z = -z;
+		return v;
+	}
+
+	template<typename U>
+	    HOST DEVICE inline Vector3_t<T> operator+(const Vector3_t<U>& w ) const {
+	    using TU = typename std::common_type<T,U>::type;
+	    Vector3_t<TU> v;
+	    v.x = x + w.x;
+	    v.y = y + w.y;
+	    v.z = z + w.z;
+	    return v;
+	}
+
+	HOST DEVICE inline Vector3_t<T> operator-(const Vector3_t<T>& w ) const {
+		Vector3_t<T> v;
+		v.x = x - w.x;
+		v.y = y - w.y;
+		v.z = z - w.z;
+		return v;
+	}
+
+	template<typename U>
+	HOST DEVICE inline Vector3_t<T> operator*(U&& s) const {
+	    // TODO, throw exception if int
+	    using TU = typename std::common_type<T,U>::type;
+	    Vector3_t<TU> v;
+	    v.x = s*x;
+	    v.y = s*y;
+	    v.z = s*z;
+	    return v;
+	}
+
+	template<typename U>
+	HOST DEVICE inline Vector3_t<T> operator/(U&& s) const {
+	    const U inv = static_cast<U>(1.0)/s;
+	    return (*this)*inv;
+	}
+
+	template<typename U>
+	HOST DEVICE inline float dot(const Vector3_t<U>& w) const { return x*w.x + y*w.y + z*w.z; }
+
+	HOST DEVICE inline float length() const { return sqrtf(x*x + y*y + z*z); }
+	HOST DEVICE inline float length2() const { return x*x + y*y + z*z; }
+	HOST DEVICE inline float rLength() const {
+		float l = length();
+		if (l != 0.0f)
+			return 1.0f / l;
+		return 0.0f;
+	}
+
+	HOST DEVICE inline float rLength2() const {
+		float l2 = length2();
+		if (l2 != 0.0f) return 1.0f / l2;
+		return 0.0f;
+	}
+
+	template<typename U>
+	HOST DEVICE static inline auto element_mult(const Vector3_t<T>&& v, const Vector3_t<U>&& w) {
+	    using TU = typename std::common_type<T,U>::type;
+		Vector3_t<TU> ret(
+			v.x*w.x,
+			v.y*w.y,
+			v.z*w.z);
+		return ret;
+	}
+
+	HOST DEVICE static inline Vector3_t<T> element_sqrt(const Vector3_t<T>&& w) {
+		Vector3_t<T> ret(
+			sqrt(w.x),
+			sqrt(w.y),
+			sqrt(w.z));
+		return ret;
+	}
+
+	HOST DEVICE inline void print() const {
+		printf("%0.3f %0.3f %0.3f\n", x,y,z);
+	}
+
+	T x, y, z, w; //append a member w	
+
+	char* to_char() const {
+	    char* s = new char[128];
+	    sprintf(s, "%.10g %.10g %.10g", x, y, z);
+	    s[127] = 0;
+	    return s;
+	}
+};
+
+
+
+/* template<typename T> */
+/* HOST DEVICE inline Vector3_t<T> operator*(float s, Vector3_t<T> v) { */
+/* 	v.x *= s; */
+/* 	v.y *= s; */
+/* 	v.z *= s; */
+/* 	return v; */
+/* } */
+
+// template<typename T>
+// HOST DEVICE inline Vector3_t<T> operator/(Vector3_t<T> v, float s) {
+// 	const float sinv = 1.0f/s;
+// 	return v*sinv;
+// }
+
+template<typename T, typename U>
+HOST DEVICE inline auto operator/(U&& s, Vector3_t<T>&& v) {
+    // TODO, throw exception if int
+    using TU = typename std::common_type<T,U>::type;
+    Vector3_t<TU> ret;
+    ret.x = s / v.x;
+    ret.y = s / v.y;
+    ret.z = s / v.z;
+    return ret;
+}
+
+namespace std {
+    template<typename T,typename U>
+    struct common_type<Vector3_t<T>, Vector3_t<U>> {
+	using type = Vector3_t<common_type_t<T,U>>;
+    };
+}