diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000000000000000000000000000000000..eae98cd748ea4690e5c8128fe95223ce9534a127
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "extern/spdlog"]
+	path = extern/spdlog
+	url = https://github.com/gabime/spdlog.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9aee9f94644d0133f1fc1dea0a6559506df4d9b6..9109ba62247dd177182d6f9721bf16d0b9bfaefa 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,11 +64,13 @@ endif()
 option(DEBUG "Build with debug flags" False)
 option(USE_NCCL "Use NCCL for single node GPU peer communication" False)
 option(USE_NVTX "Build with NVTX profiling ranges" False)
+option(USE_LOGGER "Build with SPDLog" True)
 
 # (not yet optional) message(STATUS "USE_CUDA: ${USE_CUDA}")
 message(STATUS "DEBUG: ${DEBUG}")
 message(STATUS "USE_NCCL: ${USE_NCCL}")
 message(STATUS "USE_NVTX: ${USE_NVTX}")
+message(STATUS "USE_LOGGER: ${USE_LOGGER}")
 
 
 ## Set flags before adding executable 
@@ -96,6 +98,28 @@ if(USE_NCCL)
   add_definitions(-DUSE_NCCL)
   target_link_libraries("${PROJECT_NAME}" PRIVATE nccl)
 endif()
+if(USE_LOGGER)
+  # spdlog is vendored as a git submodule under extern/spdlog.
+  # Use an absolute include path: subdirectories reuse ${spdlog_DIR}, and a
+  # relative path would be resolved against each subdirectory instead.
+  set(spdlog_DIR "${CMAKE_CURRENT_SOURCE_DIR}/extern/spdlog/include")
+  set(CMAKE_spdlog_DIR "${spdlog_DIR}")
+  add_subdirectory(extern/spdlog)
+  # find_package(spdlog REQUIRED)
+  include_directories(${spdlog_DIR})
+  set(SPDLOG_LEVEL SPDLOG_LEVEL_DEBUG)
+
+  # target_include_directories("lib${PROJECT_NAME}" PRIVATE ${spdlog_DIR})
+  # target_include_directories("${PROJECT_NAME}" PRIVATE ${spdlog_DIR})
+
+  # if(DEFINED ENV{CUDA_INCLUDE_DIRS})
+  #   set(CUDA_INCLUDE_DIRS $ENV{CUDA_INCLUDE_DIRS})
+  # endif()
+  # target_link_libraries("${PROJECT_NAME}" PRIVATE spdlog::spdlog_header_only)
+  # target_link_libraries("lib${PROJECT_NAME}" PRIVATE spdlog::spdlog_header_only)
+  # target_link_libraries("lib${PROJECT_NAME}" spdlog)
+  # target_link_libraries("${PROJECT_NAME}" spdlog)
+endif()
 
 ## Two lines below needed?
 set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
@@ -147,5 +171,12 @@ endif()
 if(USE_NVTX)
   target_link_libraries("${PROJECT_NAME}" PUBLIC nvToolsExt)
 endif()
+if(USE_LOGGER)
+  add_definitions(-DUSE_LOGGER)
+  # USE_LOGGER switches on the logging macros; SPDLOG_ACTIVE_LEVEL is the compile-time threshold.
+  target_include_directories("${PROJECT_NAME}" PRIVATE ${spdlog_DIR})
+  target_link_libraries("${PROJECT_NAME}" PRIVATE spdlog::spdlog_header_only)
+  add_compile_definitions(SPDLOG_ACTIVE_LEVEL=${SPDLOG_LEVEL})
+endif()
 
 install(TARGETS "${PROJECT_NAME}")
diff --git a/extern/spdlog b/extern/spdlog
new file mode 160000
index 0000000000000000000000000000000000000000..1ef8d3ce348daf5d580e27fc68e91628ce42c1f4
--- /dev/null
+++ b/extern/spdlog
@@ -0,0 +1 @@
+Subproject commit 1ef8d3ce348daf5d580e27fc68e91628ce42c1f4
diff --git a/src/ARBDException.h b/src/ARBDException.h
index 810c143a8579061ed05ba4aad3041c8723535f79..d310a33857ff422f8da52359beeffe87f55fc539 100644
--- a/src/ARBDException.h
+++ b/src/ARBDException.h
@@ -9,6 +9,7 @@
 #include <string>
 #include <cstdarg>
 #include <exception>
+#include "SignalManager.h"
 
 enum ExceptionType {
     UnspeficiedError,
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index db6290e506a3458881fe33be5e05a201017a16b7..742832a7e83fa4fefc7c2117f96873e470694a01 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -1,3 +1,4 @@
+
 add_library("lib${PROJECT_NAME}"
   ARBDException.cpp
   GPUManager.cpp
@@ -11,3 +12,11 @@ add_library("lib${PROJECT_NAME}"
   SignalManager.cpp
   PatchOp.cu
 )
+if(USE_LOGGER)
+  add_definitions(-DUSE_LOGGER)
+  target_include_directories("lib${PROJECT_NAME}" PRIVATE ${spdlog_DIR})
+  target_link_libraries("lib${PROJECT_NAME}" PRIVATE spdlog::spdlog_header_only)	
+  # SPDLOG_ACTIVE_LEVEL is the compile-time log threshold; spdlog_DIR and
+  # SPDLOG_LEVEL are expected to be set by the parent CMakeLists.txt.
+  add_compile_definitions(SPDLOG_ACTIVE_LEVEL=${SPDLOG_LEVEL})
+endif()
diff --git a/src/GPUManager.cpp b/src/GPUManager.cpp
index 45a9faec474903e7534dee762f49ac3c037fadd2..3f8dab9920567b7edbcb85e3a1ec849e11158fd3 100644
--- a/src/GPUManager.cpp
+++ b/src/GPUManager.cpp
@@ -20,17 +20,17 @@ std::vector<GPU> GPUManager::allGpus, GPUManager::gpus, GPUManager::notimeouts;
 GPU::GPU(unsigned int id) : id(id) {
     cudaSetDevice(id);
     cudaGetDeviceProperties(&properties, id);
-    printf("[%d] %s ", id, properties.name);
+    const char* timeout_str = ""; // const: points at string literals; set below when kernelExecTimeoutEnabled
     if (properties.kernelExecTimeoutEnabled) {
-	printf("(may timeout) ");
+	timeout_str = "(may timeout) ";
 	may_timeout = true;
     } else {
 	may_timeout = false;
     }
-    printf("| SM %d.%d, ", properties.major, properties.minor);
-    printf("%.2fGHz, ", (float) properties.clockRate * 10E-7);
-    printf("%.1fGB RAM\n", (float) properties.totalGlobalMem * 7.45058e-10);
-
+    INFO("[{}] {} {}| SM {}.{}, {:.2f}GHz, {:.1f}GB RAM",
+	 id, properties.name, timeout_str, properties.major, properties.minor,
+	 (float) properties.clockRate * 10E-7, (float) properties.totalGlobalMem * 7.45058e-10);
+    
     streams_created = false;
     // fflush(stdout);
     // gpuErrchk( cudaDeviceSynchronize() );
@@ -59,12 +59,12 @@ void GPU::create_streams() {
 
 void GPU::destroy_streams() {
     int curr;
-    // printf("Destroying streams\n");
+    TRACE("Destroying streams");
     if (cudaGetDevice(&curr) == cudaSuccess) { // Avoid errors when program is shutting down
 	gpuErrchk( cudaSetDevice(id) );
 	if (streams_created) {
 	    for (int i = 0; i < NUMSTREAMS; i++) {
-		// printf("  destroying stream %d at %p\n", i, (void *) &streams[i]);
+		TRACE("  destroying stream {} at {}\n", i, fmt::ptr((void *) &streams[i]));
 		gpuErrchk( cudaStreamDestroy( streams[i] ) );
 	    }
 	}
@@ -76,7 +76,7 @@ void GPU::destroy_streams() {
 
 void GPUManager::init() {
     gpuErrchk(cudaGetDeviceCount(&nGPUs));
-    printf("Found %d GPU(s)\n", nGPUs);
+    INFO("Found {} GPU(s)", nGPUs);
     for (int dev = 0; dev < nGPUs; dev++) {
 	GPU g(dev);
 	allGpus.push_back(g);
@@ -84,7 +84,7 @@ void GPUManager::init() {
     }
     is_safe = false;
     if (allGpus.size() == 0) {
-	fprintf(stderr, "Error: Did not find a GPU\n");
+	Exception(ValueError, "Did not find a GPU\n");
 	exit(1);
     }
 }
@@ -96,19 +96,20 @@ void GPUManager::load_info() {
 }
 
 void GPUManager::init_devices() {
-    printf("Initializing devices... ");
+    INFO("Initializing GPU devices... ");
+    std::string msg; // device id list; std::string avoids sprintf(msg, "%s...", msg) whose src/dst overlap is UB
     for (unsigned int i = 0; i < gpus.size(); i++) {
     	if (i != gpus.size() - 1 && gpus.size() > 1)
-    	    printf("%d, ", gpus[i].id);
+    	    msg += std::to_string(gpus[i].id) + ", ";
     	else if (gpus.size() > 1)
-    	    printf("and %d\n", gpus[i].id);
+    	    msg += "and " + std::to_string(gpus[i].id);
     	else
-    	    printf("%d\n", gpus[i].id);
-
+    	    msg = std::to_string(gpus[i].id);
     	use(i);
     	cudaDeviceSetCacheConfig( cudaFuncCachePreferL1 );
     	gpus[i].create_streams();
     }
+    INFO("Initializing GPUs: {}", msg);
     use(0);
     gpuErrchk( cudaDeviceSynchronize() );
 }
diff --git a/src/Proxy.h b/src/Proxy.h
index bec32e21995bbd15ef6c49725188294c1c45636c..78ee47dc5d8643e1dcd392ebc708a7eb37acb8dd 100644
--- a/src/Proxy.h
+++ b/src/Proxy.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <future>
 #include <iostream>
 #include "ARBDException.h"
 
@@ -10,7 +11,7 @@ struct Resource {
     /**
      * @brief Enum to specify the type of the resource (e.g., CPU or GPU).
      */
-    enum ResourceType {CPU, GPU};
+    enum ResourceType {CPU, MPI, GPU};
     ResourceType type; ///< Type of the resource.
     size_t id; ///< ID or any other identifier associated with the resource.
     // HOST DEVICE static bool is_local() { // check if thread/gpu idx matches some global idx };
@@ -56,6 +57,55 @@ public:
      */
     Resource location;	    ///< The device (thread/gpu) holding the data represented by the proxy.
     T* addr;		    ///< The address of the underlying object.
+
+    /**
+     * @brief Call a member function of the proxied object and return its result.
+     * Only Resource::CPU is implemented; other resource types print an error to
+     * std::cerr and return a value-initialized RetType.
+     */
+    template <typename RetType, typename... Args>
+    RetType callSync(RetType (T::*memberFunc)(Args...), Args... args) {
+        switch (location.type) {
+            case Resource::CPU:
+                return (addr->*memberFunc)(args...);
+            case Resource::GPU:
+                std::cerr << "Error: GPU not implemented in synchronous call." << std::endl;
+                break;
+            case Resource::MPI:
+                std::cerr << "Error: MPI not implemented in synchronous call." << std::endl;
+                break;
+            default:
+                std::cerr << "Error: Unknown resource type." << std::endl;
+        }
+        // RetType() rather than RetType{}: also well-formed when RetType is void.
+        return RetType();
+    }
+
+    /**
+     * @brief Asynchronous callSync: runs memberFunc via std::async for CPU resources.
+     * Other resource types print an error and yield a value-initialized RetType.
+     */
+    template <typename RetType, typename... Args>
+    std::future<RetType> callAsync(RetType (T::*memberFunc)(Args...), Args... args) {
+        switch (location.type) {
+            case Resource::CPU: {
+                // Capture the object pointer by value, not `this`: the Proxy may die before the task runs.
+                T* target = addr;
+                return std::async(std::launch::async, [target, memberFunc, args...] {
+                    return (target->*memberFunc)(args...);
+                });
+            }
+            case Resource::GPU:
+                std::cerr << "Error: GPU not implemented in asynchronous call." << std::endl;
+                break;
+            case Resource::MPI:
+                std::cerr << "Error: MPI not implemented in asynchronous call." << std::endl;
+                break;
+            default:
+                std::cerr << "Error: Unknown resource type." << std::endl;
+        }
+        return std::async(std::launch::async, [] { return RetType(); });
+    }
 };
 
 /**
@@ -99,11 +149,11 @@ HOST inline Proxy<T> _send_ignoring_children(const Resource& location, T& obj, T
  */
 template <typename T, typename Dummy = void, typename std::enable_if_t<!has_send_children<T>::value, Dummy>* = nullptr>
 HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) {
-    printf("Sending object %s @%x to device at %x\n", type_name<T>().c_str(), &obj, dest);
-
+    TRACE("...Sending object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest));
     // Simple objects can simply be copied without worrying about contained objects and arrays
     auto ret = _send_ignoring_children<T>(location, obj, dest);
-    printf("...done\n");        
+    TRACE("...done sending");
+    // ret is the Proxy<T> that _send_ignoring_children produced for obj at location
     return ret;
 }
 
@@ -118,11 +168,11 @@ HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) {
  */
 template <typename T, typename Dummy = void, typename std::enable_if_t<has_send_children<T>::value, Dummy>* = nullptr>
 HOST inline Proxy<T> send(const Resource& location, T& obj, T* dest = nullptr) {
-    printf("Sending object %s @%x to device at %x\n", type_name<T>().c_str(), &obj, dest);
+    TRACE("Sending complex object {} @{} to device at {}", type_name<T>().c_str(), fmt::ptr(&obj), fmt::ptr(dest));
     auto dummy = obj.send_children(location); // function is expected to return an object of type obj with all pointers appropriately assigned to valid pointers on location
     Proxy<T> ret = _send_ignoring_children(location, dummy, dest);
-    printf("clearing...\n");
+    TRACE("... clearing dummy complex object");
     dummy.clear();
-    printf("...done\n");    
+    TRACE("... done sending");
     return ret;
 }
diff --git a/src/SignalManager.cpp b/src/SignalManager.cpp
index 08a87ac7f202d2f7674527bc32a21a26cc95e03e..4605268518bf77987d12b7574888f2e8a4131fe6 100644
--- a/src/SignalManager.cpp
+++ b/src/SignalManager.cpp
@@ -1,4 +1,5 @@
 #include "SignalManager.h"
+
 #include <cstdio>
 #include <cstdlib>
 #ifdef SIGNAL
@@ -33,6 +34,9 @@ void SignalManager::segfault_handler(int sig, siginfo_t *info, void *secret)
 
 void SignalManager::manage_segfault() 
 {
+#ifdef USE_LOGGER
+    spdlog::set_level(spdlog::level::trace);
+#endif
 	struct sigaction sa;
 
 	sa.sa_sigaction = segfault_handler;
@@ -44,6 +48,10 @@ void SignalManager::manage_segfault()
 
 #else
 void SignalManager::segfault_handler(int sig, siginfo_t *info, void *secret) {}
-void SignalManager::manage_segfault() {}
+void SignalManager::manage_segfault() {
+#ifdef USE_LOGGER
+    spdlog::set_level(spdlog::level::trace);
+#endif
+}
 
 #endif
diff --git a/src/SignalManager.h b/src/SignalManager.h
index c58ad2d9dface2c2d1e45a2b66dbc50a2aa676eb..0b9afb8ade5f583d1ff1d227ebbc7984301f80d8 100644
--- a/src/SignalManager.h
+++ b/src/SignalManager.h
@@ -6,6 +6,39 @@
 #ifndef SIGNALMANAGER_H_
 #define SIGNALMANAGER_H_
 
+#ifdef USE_LOGGER
+
+// SPDLOG_ACTIVE_LEVEL must be defined before <spdlog/spdlog.h> is included:
+// spdlog fixes its own default there and compiles out the SPDLOG_* macros below it.
+#ifndef SPDLOG_ACTIVE_LEVEL
+#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_TRACE
+#endif
+
+#define FMT_HEADER_ONLY
+#include <spdlog/fmt/bundled/core.h>
+#include <spdlog/fmt/bundled/format.h>
+#include <spdlog/spdlog.h>
+
+#define TRACE(...) SPDLOG_TRACE(__VA_ARGS__)
+#define DEBUG(...) SPDLOG_DEBUG(__VA_ARGS__)
+#define INFO(...) SPDLOG_INFO(__VA_ARGS__)
+#define WARN(...) SPDLOG_WARN(__VA_ARGS__)
+#define ERROR(...) SPDLOG_ERROR(__VA_ARGS__)
+#define CRITICAL(...) SPDLOG_CRITICAL(__VA_ARGS__)
+
+#else
+
+// Disable logger macros
+// NOTE to developers: only use the macros below for logging, only in host code
+#define TRACE(...)
+#define DEBUG(...)
+#define INFO(...)
+#define WARN(...)
+#define ERROR(...)
+#define CRITICAL(...)
+
+#endif
+
 // see http://www.linuxjournal.com/files/linuxjournal.com/linuxjournal/articles/063/6391/6391l3.html
 #include <csignal>
 #include <execinfo.h>
diff --git a/src/SimSystem.cpp b/src/SimSystem.cpp
index 20e87bfa4009cb3d92ba2b3d0c5146119e5e4f70..74df51316b9009dc4e839f3d5eb7b422122bb28a 100644
--- a/src/SimSystem.cpp
+++ b/src/SimSystem.cpp
@@ -17,7 +17,7 @@ void CellDecomposer::decompose(SimSystem& sys, ResourceCollection& resources) {
     Vector3 dr = max-min;
 
     // For starters, distribute patches uniformly among available resources
-    Vector3 n_p_v = (dr / cutoff).element_floor(); // ordered z-fast
+    Vector3 n_p_v = (dr / cutoff).element_floor();
     size_t n_r = resources.resources.size();
 
     size_t n_p = static_cast<size_t>(round(n_p_v[0]*n_p_v[1]*n_p_v[2]));
diff --git a/src/Tests/CMakeLists.txt b/src/Tests/CMakeLists.txt
index 4b2055025b40384e6a98abc2f04296bf1b582805..147d2fb19db29dd35ea71424ab107286173bd8d8 100644
--- a/src/Tests/CMakeLists.txt
+++ b/src/Tests/CMakeLists.txt
@@ -27,6 +27,12 @@ endif()
 if(USE_NVTX)
   target_link_libraries(arbd_tests PUBLIC nvToolsExt)
 endif()
+if(USE_LOGGER)
+  add_definitions(-DUSE_LOGGER)
+  target_include_directories(arbd_tests PRIVATE ${spdlog_DIR})
+  target_link_libraries(arbd_tests PRIVATE spdlog::spdlog_header_only)
+  add_compile_definitions(SPDLOG_ACTIVE_LEVEL=${SPDLOG_LEVEL})
+endif()
 
 ## catch_discover_tests("${PROJECT_NAME}_TESTS")
 
diff --git a/src/Tests/catch_boiler.h b/src/Tests/catch_boiler.h
index 571424f62e6af61b25e50a61575b4ba05dcd5fdf..03b7f10d043e8bf7ea82678cd72214ffe1311189 100644
--- a/src/Tests/catch_boiler.h
+++ b/src/Tests/catch_boiler.h
@@ -8,7 +8,6 @@
 #include <nvfunctional>
 
 #include "../type_name.h"
-
 /* #include <catch2/catch_tostring.hpp> */
 /* namespace Catch { */
 /*     template<typename T, bool b1, bool b2> */
@@ -34,6 +33,7 @@ namespace Tests {
 namespace Tests {\
 template<typename Op_t, typename R, typename ...T>\
     void run_trial( std::string name, R expected_result, T...args) {\
+    SignalManager::manage_segfault();\
 	R *gpu_result_d, gpu_result, cpu_result;\
 	cpu_result = Op_t::op(args...);\
 	cudaMalloc((void **)&gpu_result_d, sizeof(R));\